In [8]:
from heapq import heapify
import os
import json
from typing import Counter
import pandas as pd
import re
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pickle

import math

import warnings
warnings.filterwarnings("ignore")

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer, SnowballStemmer, LancasterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk import pos_tag

from sklearn.feature_extraction.text import CountVectorizer
import sklearn.model_selection
from sklearn.model_selection import RepeatedKFold, cross_val_score, GridSearchCV
import sklearn.preprocessing as preproc
from sklearn.feature_extraction import text
from sklearn.svm import LinearSVR
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression, ElasticNet, ElasticNetCV, LassoLars, SGDRegressor
from sklearn.metrics import confusion_matrix, mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVR

from IPython.display import FileLink, FileLinks

from tqdm import tqdm

!pip install rouge-score
from rouge_score import rouge_scorer

import psutil
import gc
from collections import Counter

[0m

In [9]:

def json_to_df(json_path,type):
    """Load a line-delimited dataset file and keep only its extractive summaries.

    Each non-blank line of ``json_path`` is parsed as one Python/JSON-style dict
    literal. Records whose ``text`` or ``summary`` contain UTF-16 surrogate code
    points are dropped, then only records with ``density_bin == "extractive"``
    are kept. The result is also written to
    ``../Data/DataFrames/{type}_set.csv`` (relative to the working directory).

    Args:
        json_path: path of the input file, one record per line.
        type: dataset tag used in the output file name (e.g. "train", "dev").
            The name shadows the builtin but is kept for caller compatibility.

    Returns:
        pandas.DataFrame with the columns listed in ``cols`` below.
    """
    # Local import: ast.literal_eval parses literals only, so a malicious or
    # corrupted line can no longer execute arbitrary code the way eval() could.
    import ast

    cols = ["title", "date", "text", "summary", "compression", "coverage",
            "density", "compression_bin", "coverage_bin"]
    surrogate_re = re.compile(r'[\uD800-\uDFFF]')

    with open(json_path, "r", encoding="utf-8") as f:
        lines = [ast.literal_eval(l.strip()) for l in f if l.strip()]

    def has_surrogate(record):
        # exclude lines with surrogates in their text/summary
        return any(
            surrogate_re.search(record[k]) is not None
            for k in ("text", "summary") if k in record
        )

    # we need only the extractive summaries as we are building an extractive summarizer.
    # Values are pulled in `cols` order (not in each record's own key order) so
    # every value lands under the matching column header.
    data = [
        [l.get(c) for c in cols]
        for l in lines
        if not has_surrogate(l) and l["density_bin"] == "extractive"
    ]
    df = pd.DataFrame(data, columns=cols)

    df.to_csv(f"..{os.sep}Data{os.sep}DataFrames{os.sep}{type}_set.csv", header=True, index=False )

    return df


# text processing functions

# Mapping from lower-case English contractions to their expanded forms, taken from
# http://stackoverflow.com/questions/19790188/expanding-english-language-contractions-in-python
# Looked up token-by-token in sentence_cleaning().
contractions = { "ain't": "am not", "aren't": "are not", "can't": "cannot", "can't've": "cannot have", "'cause": "because", "could've": "could have", "couldn't": "could not", "couldn't've": "could not have", "didn't": "did not", "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hadn't've": "had not have", "hasn't": "has not", "haven't": "have not", "he'd": "he would", "he'd've": "he would have", "he'll": "he will", "he's": "he is", "how'd": "how did", "how'll": "how will", "how's": "how is", "i'd": "i would", "i'll": "i will", "i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would", "it'll": "it will", "it's": "it is", "let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have", "mightn't": "might not", "must've": "must have", "mustn't": "must not", "needn't": "need not", "oughtn't": "ought not", "shan't": "shall not", "sha'n't": "shall not", "she'd": "she would", "she'll": "she will", "she's": "she is", "should've": "should have", "shouldn't": "should not", "that'd": "that would", "that's": "that is", "there'd": "there had", "there's": "there is", "they'd": "they would", "they'll": "they will", "they're": "they are", "they've": "they have", "wasn't": "was not", "we'd": "we would", "we'll": "we will", "we're": "we are", "we've": "we have", "weren't": "were not", "what'll": "what will", "what're": "what are", "what's": "what is", "what've": "what have", "where'd": "where did", "where's": "where is", "who'll": "who will", "who's": "who is", "won't": "will not", "wouldn't": "would not", "you'd": "you would", "you'll": "you will", "you're": "you are" }


def sentence_cleaning(text, remove_stopwords = True, sub_contractions=True, stemming=True):
    """Normalise one sentence and return it both as a string and as tokens.

    Steps: lower-case, tokenize, optionally expand contractions, strip URLs /
    HTML remnants / punctuation, re-tokenize, optionally drop English stop
    words, optionally Snowball-stem.

    Args:
        text: raw sentence.
        remove_stopwords: drop NLTK English stop words when True.
        sub_contractions: replace tokens found in ``contractions`` when True.
        stemming: apply the English Snowball stemmer when True.

    Returns:
        (cleaned_text, tokens) where ``cleaned_text`` is ``" ".join(tokens)``.
    """
    # The module-level progress bar is optional: tick it when a caller has
    # created one, but do not fail when the function is used on its own.
    progress = globals().get("pbar_cleaning")
    if progress is not None:
        progress.update(1)

    # Convert words to lower case
    text = text.lower()
    toks = word_tokenize(text)

    # Replace contractions with their longer forms.
    # NOTE(review): the lookup runs on word_tokenize output; if the tokenizer
    # splits a contraction into several tokens it will not match a key here -
    # confirm whether expansion should happen before tokenization instead.
    if sub_contractions:
        toks = [contractions.get(word, word) for word in toks]

    # Always rebuild the text from the tokens; previously this line read a
    # variable that only existed when sub_contractions was True.
    text = " ".join(toks)

    # Format words and remove unwanted characters
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text = re.sub(r'\<a href', ' ', text)
    text = re.sub(r'&amp;', '', text)
    text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', text)
    text = re.sub(r'<br />', ' ', text)
    text = re.sub(r'\'', ' ', text)

    toks_clean = word_tokenize(text)

    if remove_stopwords:
        # Build the stop-word set once and keep it on the function object
        # instead of re-reading the corpus for every sentence.
        stops = getattr(sentence_cleaning, "_stops", None)
        if stops is None:
            stops = frozenset(stopwords.words("english"))
            sentence_cleaning._stops = stops
        toks_clean = [w for w in toks_clean if w not in stops]

    if stemming:
        # Same caching idea for the stemmer instance.
        stemmer = getattr(sentence_cleaning, "_stemmer", None)
        if stemmer is None:
            stemmer = SnowballStemmer(language="english")
            sentence_cleaning._stemmer = stemmer
        toks_clean = [stemmer.stem(w) for w in toks_clean]

    text = " ".join(toks_clean)

    return text, toks_clean


def rouge_scoring(sentence,summary,sc_type="rougeL",score="fmeasure"):
    """Score one candidate sentence against a reference summary with ROUGE.

    Args:
        sentence: candidate sentence (the prediction).
        summary: reference summary (the target).
        sc_type: ROUGE variant to compute, e.g. "rouge1", "rouge2", "rougeL".
        score: which component to return: "precision", "recall" or "fmeasure".

    Returns:
        The requested component of the requested ROUGE score.
    """
    # Tick the shared progress bar only when a caller has created one.
    progress = globals().get("pbar")
    if progress is not None:
        progress.update(1)

    # Building a scorer per sentence is wasteful; keep one scorer per ROUGE
    # variant on the function object and compute only the variant asked for.
    scorers = rouge_scoring.__dict__.setdefault("_scorers", {})
    r_scorer = scorers.get(sc_type)
    if r_scorer is None:
        r_scorer = rouge_scorer.RougeScorer([sc_type])
        scorers[sc_type] = r_scorer

    # score(target, prediction) returns a dict of (precision, recall, fmeasure) tuples.
    score_ind={"precision":0, "recall":1, "fmeasure":2}
    return r_scorer.score(summary,sentence)[sc_type][score_ind[score]]


def text_processing(df,data_type,df_dir,sc_type="rougeL"):
    """Explode articles into sentences, label them, and persist the result.

    For every row of ``df`` (columns "text" and "summary" are read) the text is
    split into sentences. Each sentence gets a ``text_id`` (factorized source
    text), a ``chosen`` flag (1 when the sentence occurs verbatim inside the
    summary) and a "rougeL" column holding the ROUGE f-measure of the sentence
    against the summary, computed for ``sc_type``.

    Two CSV files are written into ``df_dir``: the per-sentence summaries and
    the labelled sentence table (without the summary column).

    Returns:
        The labelled sentence DataFrame that was written to disk.
    """
    global pbar

    # sentence split: one row per (sentence, summary, source text)
    rows = []
    for article, reference in zip(df["text"].values, df["summary"].values):
        for sent in sent_tokenize(article):
            rows.append([sent, reference, article])
    new_df = pd.DataFrame(rows, columns=["sentence", "summary", "text"])

    # identical source texts share the same id
    new_df["text_id"] = pd.factorize(new_df["text"])[0]

    # chosen = 1 when the sentence appears verbatim in its reference summary
    new_df["chosen"] = 0
    is_verbatim = [sent in reference for sent, reference in zip(new_df["sentence"], new_df["summary"])]
    verbatim_rows = new_df[is_verbatim].index
    new_df.loc[verbatim_rows, "chosen"] = 1
    new_df.pop("text")

    # labels: rouge_scoring() ticks the shared progress bar once per row
    pbar = tqdm(total=new_df.shape[0])

    def row_score(row):
        return rouge_scoring(row["sentence"], row["summary"], sc_type=sc_type, score="fmeasure")

    new_df["rougeL"] = new_df.apply(row_score, axis=1)
    print(new_df["rougeL"])

    summaries_path = os.path.join(df_dir, f"{data_type}_data_{sc_type}_summaries.csv")
    new_df["summary"].to_csv(summaries_path, header=True, index=False)
    new_df.pop("summary")

    sentences_path = os.path.join(df_dir, f"{data_type}_data_{sc_type}.csv")
    new_df.to_csv(sentences_path, header=True, index=False)

    return new_df


def add_chosen_text_id(df):
    """Return per-sentence ``text_id`` and ``chosen`` columns for ``df``.

    The "text" column of ``df`` is split into sentences; every sentence row
    receives the factorized id of its source text and a flag that is 1 when
    the sentence occurs verbatim inside the row's "summary".

    Returns:
        DataFrame with exactly two columns, "text_id" and "chosen", one row
        per sentence.
    """
    # sentence split: one row per (sentence, summary, source text)
    rows = []
    for article, reference in zip(df["text"].values, df["summary"].values):
        for sent in sent_tokenize(article):
            rows.append([sent, reference, article])
    new_df = pd.DataFrame(rows, columns=["sentence", "summary", "text"])

    new_df["text_id"] = pd.factorize(new_df["text"])[0]

    new_df["chosen"] = 0
    is_verbatim = [sent in reference for sent, reference in zip(new_df["sentence"], new_df["summary"])]
    verbatim_rows = new_df[is_verbatim].index
    new_df.loc[verbatim_rows, "chosen"] = 1

    # only the id and the flag are handed back to the caller
    for helper_col in ("text", "summary", "sentence"):
        new_df.pop(helper_col)

    return new_df


def create_labels(data_dir,df_dir="/kaggle/input/summarizer-data/"):
    """Build the labelled sentence tables for the dev and test sets.

    Reads ``dev_set.csv`` and ``test_set.csv`` from ``df_dir`` and runs
    ``text_processing`` on each with ROUGE-L, which writes its CSV output into
    ``data_dir``.

    Args:
        data_dir: output directory handed to ``text_processing``.
        df_dir: directory containing the ``*_set.csv`` input files.

    Returns:
        (train_data_list, dev_data, test_data). Training-set labelling is
        currently disabled, so ``train_data_list`` is always an empty list.
    """
    # Training-set labelling is switched off. The disabled code split
    # train_set.csv into four chunks with np.array_split and labelled each as
    # "train1".."train4":
    #     train_df = pd.read_csv(os.path.join(df_dir,"train_set.csv"))
    #     for i,train_df in enumerate(np.array_split(train_df, 4)):
    #         train_data_list.append(text_processing(train_df,f"train{i+1}",data_dir,"rougeL"))
    # The list is still defined so the return statement below does not raise
    # a NameError.
    train_data_list = []

    dev_df = pd.read_csv(os.path.join(df_dir,"dev_set.csv"))
    test_df = pd.read_csv(os.path.join(df_dir,"test_set.csv"))

    dev_data = text_processing(dev_df,"dev",data_dir,"rougeL")
    test_data = text_processing(test_df,"test",data_dir,"rougeL")

    return train_data_list, dev_data, test_data


def df_add_tid(data_dir, df_dir):
    """Attach ``text_id`` and ``chosen`` columns to the saved ROUGE-L tables.

    For the "dev" and "test" splits: re-derive the per-sentence ids/flags from
    ``{split}_set.csv`` in ``df_dir``, copy them onto the labelled table read
    from ``/kaggle/input/summarizer-data/{split}_data_rougeL.csv`` and write
    the result to ``data_dir`` as ``{split}_data_rougeL_tid.csv``. Columns and
    shape are printed before and after the two columns are added.

    The training split is not processed here (an earlier, disabled variant
    handled one quarter of it at a time as "train{i+1}").
    """
    sc_type = "rougeL"
    data_labels_dir = "/kaggle/input/summarizer-data"

    for data_type in ("dev", "test"):
        raw_df = pd.read_csv(os.path.join(df_dir, f"{data_type}_set.csv"))
        ids_and_flags = add_chosen_text_id(raw_df)

        labelled = pd.read_csv(os.path.join(data_labels_dir, f"{data_type}_data_{sc_type}.csv"))
        print(labelled.columns, labelled.shape)

        # columns are copied by index alignment between the two frames
        for col in ("text_id", "chosen"):
            labelled[col] = ids_and_flags[col]
        print(labelled.columns, labelled.shape)

        out_path = os.path.join(data_dir, f"{data_type}_data_{sc_type}_tid.csv")
        labelled.to_csv(out_path, header=True, index=False)

    
def data_stats(df):
    """Print descriptive statistics of "rougeL", split by the "chosen" flag."""
    per_class_summary = df.groupby("chosen").describe()
    rouge_summary = per_class_summary["rougeL"]
    print(rouge_summary)

    
def thematic_ratio(them_words, word_list):
    """Fraction of a sentence's tokens that are thematic words.

    Args:
        them_words: iterable of the document's thematic words.
        word_list: tokens of one sentence.

    Returns:
        Number of tokens in ``word_list`` that occur in ``them_words`` divided
        by ``len(word_list)``; 0.0 for an empty sentence.
    """
    # An empty sentence has no thematic content; avoids a ZeroDivisionError.
    if len(word_list) == 0:
        return 0.0

    thematic = set(them_words)
    # One pass over the sentence instead of one list.count() scan per word.
    them_occ = sum(1 for w in word_list if w in thematic)
    return them_occ/len(word_list)


def s_position(t_position,tot_sent):
    """Positional score of a sentence inside its document.

    The first sentence (position 1) and the last one (position ``tot_sent``)
    score 1.0. Every other position is mapped through a cosine of the
    position, using two bounds derived from the sentence count.
    """
    # boundary sentences always get the top score
    if t_position==1 or t_position==tot_sent:
        return 1.0

    th = 0.2*tot_sent
    lower_bound = th*tot_sent
    upper_bound = 2*th*tot_sent
    offset = t_position - lower_bound
    scale = (1/upper_bound) - lower_bound
    return math.cos(offset*scale)
    
    
def prop_nouns(tokens):
    """Count proper nouns (POS tags NNP / NNPS) among ``tokens``.

    Args:
        tokens: a list of tokens, or the string form of such a list (as
            produced when a token column is round-tripped through CSV).

    Returns:
        Number of tokens tagged NNP or NNPS by ``nltk.pos_tag``.
    """
    # The old check compared against type(list), which is `type`, so it was
    # true for every input and real lists were handed to eval(). Only strings
    # need parsing, and literal_eval cannot execute code.
    if isinstance(tokens, str):
        import ast
        tokens = ast.literal_eval(tokens)

    tagged = nltk.pos_tag(list(tokens))
    return sum(1 for _, tag in tagged if tag in ("NNP", "NNPS"))


def feature_df(df, data_dir, data_type):
    """Compute the per-sentence feature table and save it as CSV.

    Reads the "sentence" and "text_id" columns of ``df``. As a side effect a
    "tokens" column (cleaned tokens of each sentence) is added to ``df``.

    Features, in output column order:
        thematic_ratio: share of tokens among the document's 10 most common tokens.
        text_position:  1-based position of the sentence inside its document.
        s_position:     positional score from ``s_position``.
        len:            token count, or 0 when below 3 tokens.
        s_pos_par:      1.0 where ``s_position`` equals 1.0, else 0.0.
        num_ratio:      share of numeric tokens.

    Args:
        df: sentence-level DataFrame.
        data_dir: directory the CSV is written to.
        data_type: split tag used in the file name ``{data_type}_set_feats.csv``.

    Returns:
        The feature DataFrame, indexed like ``df``.
    """
    global pbar_cleaning

    # tokenize. The bar is sized from the frame actually being processed, not
    # from an unrelated notebook global.
    pbar_cleaning = tqdm(total=df.shape[0], leave=True)
    try:
        df["tokens"] = df["sentence"].apply(lambda x: sentence_cleaning(x)[1])
    finally:
        pbar_cleaning.close()

    feat_df = pd.DataFrame(index=df.index)

    # create sentence features
    # 1. thematic words: the 10 most frequent tokens of each document, looked
    #    up by text_id so the match does not depend on ids being 0..n-1.
    doc_tokens = df.groupby("text_id")["tokens"].apply(
        lambda sentences: [tok for toks in sentences for tok in toks]
    )
    thematic_by_doc = doc_tokens.apply(
        lambda toks: [word for word, _ in Counter(toks).most_common(10)]
    )
    thematic = df["text_id"].map(thematic_by_doc)
    feat_df["thematic_ratio"] = [
        thematic_ratio(them, toks) if len(toks) > 0 else 0.0
        for them, toks in zip(thematic, df["tokens"])
    ]

    # 2. sentence position in the text. Kept in local Series: the row-wise
    #    lambda used before looked for text_position on `df`, where it never existed.
    text_position = df.groupby("text_id").cumcount().add(1)
    tot_sent = df.groupby("text_id")["sentence"].transform("size")
    feat_df["text_position"] = text_position
    feat_df["s_position"] = [s_position(pos, total) for pos, total in zip(text_position, tot_sent)]

    # 3. sentence length - threshold=3
    threshold = 3
    feat_df["len"] = df["tokens"].apply(lambda x: 0 if len(x) < threshold else len(x))

    # 4. sentence position - paragraph relative
    feat_df["s_pos_par"] = (feat_df["s_position"] == 1.0).astype(float)

    # 5. numerals
    feat_df["num_ratio"] = df["tokens"].apply(
        lambda x: sum(1 for t in x if t.isnumeric()) / len(x) if len(x) > 0 else 0
    )

    feat_df.to_csv(os.path.join(data_dir, f"{data_type}_set_feats.csv"), header=True, index=False)

    return feat_df



# SVM_scaler =  StandardScaler()
# LR_scaler =  MinMaxScaler()
# KNN_scaler =  StandardScaler()
# # classifier parameters
# KNN_n_num = 9
# LR_C = 1.0
# SVM_C = 0.001
# algos={
#   "ElNet": make_pipeline(SVM_scaler, SVC(C=SVM_C)),
#   "LR":  make_pipeline(LR_scaler, LogisticRegression(C=LR_C)),
#   "KNN": make_pipeline(KNN_scaler, KNeighborsClassifier(n_neighbors = KNN_n_num)),
# }


def classifier_training(model,X_train,y_train):
    """Fit ``model`` and report its performance on the training data.

    Returns:
        (model, text_report, dict_report): the fitted model and the
        ``classification_report`` of its training-set predictions as text and
        as a dict.
    """
    # classification_report is not part of the notebook's import cell, so it
    # is imported here to keep the function usable.
    from sklearn.metrics import classification_report

    model.fit(X_train,y_train)
    preds = model.predict(X_train)
    c_rep = classification_report(y_train,preds)
    c_rep_dict = classification_report(y_train,preds,output_dict=True)
    return model, c_rep, c_rep_dict

def classifier_validation(model,X_dev,y_dev):
    """Evaluate an already fitted ``model`` on the validation split.

    Returns:
        (text_report, dict_report) from ``classification_report``.
    """
    # classification_report is not part of the notebook's import cell, so it
    # is imported here to keep the function usable.
    from sklearn.metrics import classification_report

    preds = model.predict(X_dev)
    c_rep = classification_report(y_dev,preds)
    c_rep_dict = classification_report(y_dev,preds,output_dict=True)
    return c_rep, c_rep_dict

def classifier_test(model,X_test,y_test):
    """Evaluate an already fitted ``model`` on the test split.

    Returns:
        (text_report, dict_report) from ``classification_report``.
    """
    # classification_report is not part of the notebook's import cell, so it
    # is imported here to keep the function usable.
    from sklearn.metrics import classification_report

    preds = model.predict(X_test)
    c_rep = classification_report(y_test,preds)
    c_rep_dict = classification_report(y_test,preds,output_dict=True)
    return c_rep, c_rep_dict


# Training - Validation - Test pipeline
def classifier_T_V_T(X_train, y_train, X_dev, y_dev, X_test, y_test, algo_type="LR"):
    """Run the training -> validation -> test pipeline for one algorithm.

    Args:
        X_train, y_train: training features / targets.
        X_dev, y_dev: validation features / targets.
        X_test, y_test: test features / targets.
        algo_type: key into the module-level ``algos`` dict of estimators.

    Returns:
        (model, train_report, train_report_dict, dev_report, dev_report_dict,
        test_report, test_report_dict).

    Raises:
        NameError: when no module-level ``algos`` dict has been defined.
        KeyError: when ``algo_type`` is not a key of ``algos``.
    """
    # `algos` only exists in commented-out code in this notebook, so look it
    # up explicitly and explain what is missing instead of a bare NameError.
    registry = globals().get("algos")
    if registry is None:
        raise NameError(
            "classifier_T_V_T needs a module-level `algos` dict mapping "
            "algorithm names to estimators; define it before calling."
        )
    # NOTE(review): the estimator stored in `algos` is fitted in place, so a
    # second call with the same key reuses and refits the same object.
    model = registry[algo_type]

    model,c_rep_train,c_rep_dict_train=classifier_training(model,X_train,y_train)
    c_rep_dev,c_rep_dict_dev=classifier_validation(model,X_dev,y_dev)
    # the test split goes through the dedicated test helper
    c_rep_test,c_rep_dict_test=classifier_test(model,X_test,y_test)

    return model, c_rep_train, c_rep_dict_train, c_rep_dev, c_rep_dict_dev, c_rep_test, c_rep_dict_test



In [10]:
# memory clean: force a garbage-collection pass to release unreferenced objects.
# Disabled experiment that tried to delete every global by name (as written it
# is not valid Python - `not` is used where `!=` was meant):
# for v in globals():
#     print(v)
#     if str(v) not "__name__":
#         del v
gc.collect()

23

In [11]:
# Read-only Kaggle input directory holding the prepared CSV files.
df_dir="/kaggle/input/summarizer-data"

# Writable output directory for the dataframes produced by this notebook.
data_dir= "/kaggle/working/Data/DataFrames"
if not os.path.exists(data_dir):
    os.makedirs(data_dir)

# One-off preprocessing steps, kept disabled because their outputs are loaded
# from df_dir in the cells below.
# load saved dataframes and create labels
# train_data_list, dev_data, test_data = create_labels(data_dir)

# load dataframes with labels
# df_add_tid(data_dir, df_dir)

# List the files under the working directory as download links.
FileLinks(".")

In [12]:
# load training dataset
def _read_labelled_split(split, score_name):
    """Read one `{split}_data_{score_name}_tid.csv`, print its stats, split off the labels.

    Returns (frame_without_rougeL, rougeL_values).
    """
    frame = pd.read_csv(os.path.join(df_dir, f"{split}_data_{score_name}_tid.csv"))
    data_stats(frame)
    targets = frame["rougeL"].values
    frame.pop("rougeL")
    return frame, targets


sc_type = "rougeL"

# load training dataset
data_type = "train1"
train_data, labels = _read_labelled_split(data_type, sc_type)

# load validation dataset
data_type = "dev"
dev_data, labels_dev = _read_labelled_split(data_type, sc_type)

# load test dataset
data_type = "test"
test_data, labels_test = _read_labelled_split(data_type, sc_type)

            count      mean       std  min       25%       50%       75%  max
chosen                                                                       
0       2310412.0  0.105404  0.120294  0.0  0.050000  0.083333  0.122449  1.0
1        126615.0  0.349349  0.234293  0.0  0.177295  0.306569  0.461538  1.0
            count      mean       std  min       25%       50%       75%  max
chosen                                                                       
0       1019122.0  0.109952  0.127384  0.0  0.051948  0.088889  0.128205  1.0
1         29440.0  0.457427  0.297148  0.0  0.216498  0.392638  0.666667  1.0
            count      mean       std  min       25%       50%       75%  max
chosen                                                                       
0       1006718.0  0.109932  0.127422  0.0  0.051948  0.088889  0.128205  1.0
1         29216.0  0.456340  0.299931  0.0  0.210740  0.389262  0.666667  1.0


In [13]:
def load_datasets(df_dir, sc_type="rougeL"):
    """Load the pre-computed feature tables for the train1, dev and test splits.

    For each split the feature CSV (`{split}_set_feats.csv`) is read and the
    "sentence" column of the matching `{split}_data_{sc_type}_tid.csv` from
    ``df_dir`` is attached to it.

    NOTE(review): the feature CSVs are read from the fixed path
    /kaggle/input/summarizer-data/ and not from ``df_dir`` - confirm that this
    is intended.

    Returns:
        (train_data, train_data_feats, dev_data, dev_data_feats, test_data,
        test_data_feats) where each ``*_data`` is the full feature table plus
        "sentence" and each ``*_data_feats`` holds only the five model inputs.
    """
    model_inputs = ["len", "s_position", "thematic_ratio", "s_pos_par", "num_ratio"]

    def load_split(split):
        print(f"Loading {split.capitalize()} Data . . ")
        full_table = pd.read_csv(f"/kaggle/input/summarizer-data/{split}_set_feats.csv")
        sentence_table = pd.read_csv(os.path.join(df_dir, f"{split}_data_{sc_type}_tid.csv"))
        # sentences are attached by index alignment between the two files
        full_table["sentence"] = sentence_table["sentence"]
        return full_table, full_table[model_inputs]

    loaded = []
    for split in ("train1", "dev", "test"):
        loaded.extend(load_split(split))

    train_data, train_data_feats, dev_data, dev_data_feats, test_data, test_data_feats = loaded
    return train_data, train_data_feats, dev_data, dev_data_feats, test_data, test_data_feats




In [34]:
# Load the pre-computed feature tables for all three splits.
# Note: this rebinds train_data / dev_data / test_data, which an earlier cell
# had bound to the labelled sentence tables.
score_type="rougeL"
train_data, train_data_feats, dev_data, dev_data_feats, test_data, test_data_feats = load_datasets(df_dir, sc_type=score_type)

# List the files under the working directory as download links.
FileLinks(".")

Loading Train1 Data . . 
Loading Dev Data . . 
Loading Test Data . . 


In [None]:
# # tokenize
# global pbar_cleaning
# pbar_cleaning=tqdm(total=train_data.shape[0], leave=True)
# train_data["tokens"] = train_data["sentence"].apply(lambda x: sentence_cleaning(x)[1]) 

# train_data_feats=pd.DataFrame()
# # create sentence features
# # 1. thematic words
# col = train_data.groupby("text_id").tokens.apply(sum)
# thematic_cols= pd.DataFrame({"text_id": col.index, "thematic": [ [ t[0] for t in Counter(x).most_common(10) ] for x in col ]})  
# train_data=train_data.join(thematic_cols["thematic"], on='text_id' )
# train_data_feats["thematic_ratio"] = train_data.apply(lambda row: thematic_ratio(row.thematic, row.tokens) if len(row.tokens)>0 else 0.0, axis=1)

# # 2. sentence position in the text
# train_data["text_position"] = train_data.groupby("text_id").cumcount().add(1)
# train_data["tot_sent"] = train_data.groupby("text_id")["sentence"].transform(len)
# train_data_feats["s_position"] = train_data.apply(lambda row: s_position(row.text_position,row.tot_sent), axis=1)

# # 3. sentence length - threshold=3
# threshold=3
# train_data_feats["len"]= train_data["tokens"].apply(lambda x: 0 if len(x)<threshold else len(x))

# # 4. sentence position - paragraph relative
# train_data_feats['s_pos_par'] = train_data_feats["s_position"].values
# train_data_feats.loc[train_data_feats.s_pos_par!=1.0, 's_pos_par']=0.0

# # 5. numerals
# train_data_feats["num_ratio"]=train_data_feats["tokens"].apply(lambda x: sum( [ 1 for t in eval(x) if t.isnumeric() ] )/len(x) )


# # ?. Term Frequency-Inverse Sentence Frequency


# # ?. proper nouns - not so useful
# # train_data_feats["NNPs"]=train_data_feats["tokens"].apply(lambda x: prop_nouns(x) )




# # train_data_feats=train_data[["len","text_position"]]
# train_data_feats.to_csv(os.path.join(data_dir,f"{data_type}_set_feats.csv"), header=True, index=False)

In [None]:
# train_data_feats=train_data[["len","text_position","thematic_words", ]]
# data_type="train1"
# train_data.to_csv(os.path.join(data_dir,f"{data_type}_set_feats.csv"), header=True, index=False)

In [None]:
# FileLinks(".")

In [59]:
# sc_type="rougeL"
# data_type="train1"
# train_data=pd.read_csv(os.path.join(df_dir,f"{data_type}_data_{sc_type}_tid.csv"))
# data_stats(train_data)
# labels=train_data["rougeL"].values

# sc_type="rougeL"
# data_type="dev"
# dev_data=pd.read_csv(os.path.join(df_dir,f"{data_type}_data_{sc_type}_tid.csv"))
# data_stats(dev_data)
# labels=dev_data["rougeL"].values

            count      mean       std  min       25%       50%       75%  max
chosen                                                                       
0       2310412.0  0.105404  0.120294  0.0  0.050000  0.083333  0.122449  1.0
1        126615.0  0.349349  0.234293  0.0  0.177295  0.306569  0.461538  1.0
            count      mean       std  min       25%       50%       75%  max
chosen                                                                       
0       1019122.0  0.109952  0.127384  0.0  0.051948  0.088889  0.128205  1.0
1         29440.0  0.457427  0.297148  0.0  0.216498  0.392638  0.666667  1.0


In [None]:
# test_num=1000

# model = make_pipeline(MinMaxScaler(), ElasticNet(alpha=5e-3, warm_start=True, random_state=99, fit_intercept=True))
# model.fit(train_data_feats, labels)

# cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# # evaluate model
# scores_mae = cross_val_score(model, train_data_feats.iloc[:test_num,:] , labels[:test_num], scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
# scores_mse = cross_val_score(model, train_data_feats.iloc[:test_num,:] , labels[:test_num], scoring='neg_mean_squared_error', cv=cv, n_jobs=-1)
# # scores_r2 = cross_val_score(model, train_data_feats.iloc[:test_num,:] , labels[:test_num], scoring='r2', cv=cv, n_jobs=-1)

# # force scores to be positive
# scores_mae = np.absolute(scores_mae)
# scores_mse = np.absolute(scores_mse)
# # scores_r2 = np.absolute(scores_r2)
# print('Mean MAE: %.3f (%.3f)' % (np.mean(scores_mae), np.std(scores_mae)))
# print('Mean MSE: %.3f (%.3f)' % (np.mean(scores_mse), np.std(scores_mse)))
# # print('Mean R2: %.3f (%.3f)' % (np.mean(scores_r2), np.std(scores_r2)))

# print(labels)
# print(model.predict(train_data_feats))
# # svr_results(labels[:test_num],  train_data_feats.iloc[:test_num,:], model)

In [None]:
# test_num=10000

# # model = make_pipeline(MinMaxScaler(), ElasticNetCV(random_state=99))
# model =   make_pipeline(StandardScaler(),ElasticNet(alpha=5e-6, random_state=99, fit_intercept=True))
# model.fit(train_data_feats, labels)

# cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# # evaluate model
# scores_mae = cross_val_score(model, train_data_feats.iloc[:test_num,:] , labels[:test_num], scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
# scores_mse = cross_val_score(model, train_data_feats.iloc[:test_num,:] , labels[:test_num], scoring='neg_mean_squared_error', cv=cv, n_jobs=-1)
# # scores_r2 = cross_val_score(model, train_data_feats.iloc[:test_num,:] , labels[:test_num], scoring='r2', cv=cv, n_jobs=-1)

# # force scores to be positive
# scores_mae = np.absolute(scores_mae)
# scores_mse = np.absolute(scores_mse)
# # scores_r2 = np.absolute(scores_r2)
# print('Mean MAE: %.3f (%.3f)' % (np.mean(scores_mae), np.std(scores_mae)))
# print('Mean MSE: %.3f (%.3f)' % (np.mean(scores_mse), np.std(scores_mse)))
# # print('Mean R2: %.3f (%.3f)' % (np.mean(scores_r2), np.std(scores_r2)))

# print(labels)
# print(model.predict(train_data_feats))

In [None]:
# test_num=1000

# # model = make_pipeline(MinMaxScaler(), ElasticNetCV(random_state=99))
# eps=0
# model = make_pipeline(StandardScaler(), LinearSVR(epsilon=eps, C=5e-1, fit_intercept=True, intercept_scaling=1.0, loss="squared_epsilon_insensitive"))
# # model = LinearSVR(epsilon=eps, C=5e-4, fit_intercept=True)
# model.fit(train_data_feats, labels)

# cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# # evaluate model
# scores_mae = cross_val_score(model, train_data_feats.iloc[:test_num,:] , labels[:test_num], scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
# scores_mse = cross_val_score(model, train_data_feats.iloc[:test_num,:] , labels[:test_num], scoring='neg_mean_squared_error', cv=cv, n_jobs=-1)
# # scores_r2 = cross_val_score(model, train_data_feats.iloc[:test_num,:] , labels[:test_num], scoring='r2', cv=cv, n_jobs=-1)

# # force scores to be positive
# scores_mae = np.absolute(scores_mae)
# scores_mse = np.absolute(scores_mse)
# # scores_r2 = np.absolute(scores_r2)
# print('Mean MAE: %.3f (%.3f)' % (np.mean(scores_mae), np.std(scores_mae)))
# print('Mean MSE: %.3f (%.3f)' % (np.mean(scores_mse), np.std(scores_mse)))
# # print('Mean R2: %.3f (%.3f)' % (np.mean(scores_r2), np.std(scores_r2)))

# print(labels)
# print(model.predict(train_data_feats))

In [17]:
# grid search - Dev Set Tuning
def estimator_tuning(X, y, estimator=None, scaler=None):
    """Grid-search a scaler + estimator pipeline and return the best pipeline.

    Args:
        X: feature matrix.
        y: regression targets.
        estimator: final pipeline step; a fresh ``SGDRegressor()`` when None.
        scaler: first pipeline step; a fresh ``StandardScaler()`` when None.

    Returns:
        ``best_estimator_`` of the fitted ``GridSearchCV``.

    Side effects:
        Writes the best parameters and the full ``cv_results_`` to
        ``/kaggle/working/Data/<EstimatorClass>_grid_search_results.txt`` and
        prints the best score and parameters.
    """
    # Create the defaults per call rather than once at definition time, so
    # separate calls never share estimator instances.
    if estimator is None:
        estimator = SGDRegressor()
    if scaler is None:
        scaler = StandardScaler()

    model_grid = make_pipeline(scaler, estimator)

    # make_pipeline names each step after its lower-cased class; take the
    # prefix from the pipeline so it always matches the final step.
    step = model_grid.steps[-1][0]

    # Current (single-point) grid. Ranges explored earlier:
    #   alpha / tol:   5.0 ** -np.arange(2, 7) / 10.0 ** -np.arange(2, 7)
    #   loss:          ["squared_error", "huber", "epsilon_insensitive"]
    #   penalty:       ["l2", "l1", "elasticnet"]
    #   learning_rate: ["constant", "optimal", "invscaling", "adaptive"]
    param_grid = {
        f"{step}__alpha": [8.192e-10],
        f"{step}__tol": [6.4e-5],
        f"{step}__epsilon": [3.2e-4],
        f"{step}__loss": ["squared_error"],
        f"{step}__penalty": ["elasticnet"],
        f"{step}__learning_rate": ["adaptive"]
    }

    g_search = GridSearchCV(model_grid, param_grid, verbose=9, return_train_score=True, cv=2)
    g_search.fit(X, y)

    # The report is named after the estimator class: the estimator repr can
    # contain parentheses, parameters and line breaks, which make poor file names.
    out_dir = "/kaggle/working/Data"
    os.makedirs(out_dir, exist_ok=True)
    report_path = os.path.join(out_dir, f"{type(estimator).__name__}_grid_search_results.txt")
    with open(report_path, "w", encoding="utf-8") as writer:
        writer.write(f"Best Parameters:\n{g_search.best_params_}\n\n\n{g_search.cv_results_}")

    print(f"Best score: { g_search.best_score_}\nParams: {g_search.best_params_}")
    return g_search.best_estimator_


# Tune on the dev set (features + ROUGE-L labels). The recorded run of this
# cell was interrupted (see the KeyboardInterrupt in the output below).
best_model=estimator_tuning(dev_data_feats,labels_dev)

Fitting 2 folds for each of 1 candidates, totalling 2 fits


KeyboardInterrupt: 

### Best so far

In [25]:
def estimator_training(train_data_feats, labels, test_data_feats, labels_test, test_samples=1000):
    """Fit the SGD pipeline on the training data and report its quality.

    Two reports are printed:
      * repeated 10-fold cross-validation (3 repeats) on the first
        ``test_samples`` test rows. Cross-validation refits clones of the
        pipeline, so these numbers describe the configuration, not the
        returned model;
      * MAE / MSE / R2 of the returned, train-fitted model on the full test set.

    Args:
        train_data_feats: training feature table.
        labels: training targets, one per row of ``train_data_feats``.
        test_data_feats: test feature table (a DataFrame; ``.iloc`` is used).
        labels_test: test targets, one per row of ``test_data_feats``.
        test_samples: number of leading test rows used for cross-validation.

    Returns:
        The pipeline fitted on the training data.
    """
    # Local import: cross_validate is not in the notebook's import cell.
    from sklearn.model_selection import cross_validate

    model = make_pipeline(StandardScaler(), SGDRegressor(alpha=8.192e-10, max_iter=1000, tol=6.4e-5, epsilon=3.2e-4, learning_rate="adaptive", loss="squared_error", penalty="elasticnet"))
    model.fit(train_data_feats, labels)

    cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

    # evaluate the configuration: one CV pass computing all three metrics
    # instead of three separate passes that each refit every fold.
    cv_scores = cross_validate(
        model,
        test_data_feats.iloc[:test_samples,:],
        labels_test[:test_samples],
        scoring={
            "mae": 'neg_mean_absolute_error',
            "mse": 'neg_mean_squared_error',
            "r2": 'r2',
        },
        cv=cv,
        n_jobs=-1,
    )
    # MAE / MSE scorers are negated by scikit-learn; report them as positive.
    scores_mae = np.absolute(cv_scores["test_mae"])
    print('Mean MAE: %.3f (%.3f)' % (np.mean(scores_mae), np.std(scores_mae)))
    scores_mse = np.absolute(cv_scores["test_mse"])
    print('Mean MSE: %.3f (%.3f)' % (np.mean(scores_mse), np.std(scores_mse)))
    scores_r2 = cv_scores["test_r2"]
    print('Mean R2: %.3f (%.3f)' % (np.mean(scores_r2), np.std(scores_r2)))

    # evaluate the model that is actually returned on the held-out test set
    test_preds = model.predict(test_data_feats)
    print('Held-out MAE: %.3f | MSE: %.3f | R2: %.3f' % (
        mean_absolute_error(labels_test, test_preds),
        mean_squared_error(labels_test, test_preds),
        r2_score(labels_test, test_preds),
    ))

    return model


# Train on the train1 features and evaluate against the test split.
# NOTE(review): the recorded run failed with inconsistent sample counts
# [2437027, 1035934]. The second number equals the test-set row count printed
# by data_stats earlier, so `labels` presumably held test labels from another
# cell at that point - re-run the loading cell before this one and confirm.
model=estimator_training(train_data_feats, labels, test_data_feats, labels_test)

# Persist the fitted pipeline.
pkl_filepath=os.path.join("/kaggle/working/Data","SGD_model.pkl")
with open(pkl_filepath,"wb") as model_writer:
    pickle.dump(model, model_writer)

ValueError: Found input variables with inconsistent numbers of samples: [2437027, 1035934]

In [20]:
# preds_df=pd.DataFrame({"text_id": train_data["text_id"].values, "text_position": train_data["text_position"].values, "sentence": train_data["sentence"].values, "pred_rougeL": model.predict(train_data_feats)})

# n_largest=preds_df.groupby(["text_id"])["text_position","pred_rougeL"].apply(lambda x: x.nlargest(3,columns=["pred_rougeL"]).sort_index())

In [46]:
# Show the sentence stored on the row with the largest "tot_sent" value.
train_data.loc[train_data["tot_sent"].idxmax(),"sentence"]

'12 Powdermill Rd.'

In [55]:
def scoring(pred_summary, ref_summary):
    """ROUGE-1, ROUGE-2 and ROUGE-L f-measures of a predicted summary.

    Args:
        pred_summary: generated summary text.
        ref_summary: reference summary text.

    Returns:
        Tuple ``(rouge1_f, rouge2_f, rougeL_f)`` computed with stemming enabled.
    """
    metrics = ["rouge1","rouge2","rougeL"]
    r_scorer = rouge_scorer.RougeScorer(metrics, use_stemmer=True)
    rouge = r_scorer.score(ref_summary, pred_summary)

    # each score is a (precision, recall, fmeasure) triple
    fmeasure_pos = {"precision":0, "recall":1, "fmeasure":2}["fmeasure"]
    return tuple(rouge[metric][fmeasure_pos] for metric in metrics)

def summarization(sentences):
    """Concatenate the selected sentences into one string, separated by "."."""
    separator = "."
    summary_text = separator.join(sentences)
    return summary_text

def create_summary(df,s_num,th=0.19):
    """Build a summary from the best-scoring sentences of ``df``.

    Per ``text_id`` the ``s_num`` rows with the highest "pred_rougeL" are
    kept (in their original row order). The summary starts with the single
    highest-scoring sentence, followed by every other kept sentence whose
    score is above ``th`` and different from the maximum.

    Args:
        df: sentence table with "text_id", "text_position", "sentence" and
            "pred_rougeL" columns.
        s_num: number of top sentences kept per document.
        th: minimum predicted score for the additional sentences.

    Returns:
        The summary string produced by ``summarization``.
    """
    # Columns are selected with a list: tuple-style selection on a groupby
    # (["a","b","c"] without the inner brackets) was deprecated and later
    # removed from pandas.
    wanted = ["text_position","sentence","pred_rougeL"]
    n_largest = df.groupby(["text_id"])[wanted].apply(
        lambda x: x.nlargest(s_num,columns=["pred_rougeL"]).sort_index()
    )

    max_rL = max(n_largest["pred_rougeL"])
    # best sentence first ...
    sent = [ n_largest.loc[n_largest["pred_rougeL"].idxmax(),"sentence"] ]
    # ... then the remaining candidates above the threshold
    sent.extend(
        s for s,r in zip(n_largest["sentence"].values, n_largest["pred_rougeL"].values)
        if r > th and r != max_rL
    )
    summary = summarization(sent)
    return summary

def get_ref_summary(ref_summaries,doc_id):
    """Return the first "summary" whose "text_id" equals ``doc_id``.

    Raises IndexError when no row matches.
    """
    belongs_to_doc = ref_summaries["text_id"] == doc_id
    matching = ref_summaries.loc[belongs_to_doc, "summary"]
    return matching.values[0]

def get_preds(model,docX):
    """Return ``model``'s predictions for the feature rows ``docX``."""
    predictions = model.predict(docX)
    return predictions

def single_doc_summarizer(model,X,ref_summaries,th,s_num=4):
    """Summarize every document in ``X`` and score it against its reference.

    Args:
        model: fitted regressor with a ``predict`` method.
        X: sentence table with "text_id", "text_position", "sentence" and the
            five feature columns.
        ref_summaries: table with "text_id" and "summary" columns.
        th: score threshold forwarded to ``create_summary``.
        s_num: number of top sentences per document.

    Returns:
        Three lists (ROUGE-1, ROUGE-2, ROUGE-L f-measures), one entry per
        document in ``text_id`` group order.
    """
    # Tick the shared progress bar only when a caller has created one.
    progress = globals().get("doc_p_bar")

    cols=["len", "s_position", "thematic_ratio", "s_pos_par", "num_ratio"]
    tot_r1=[]
    tot_r2=[]
    tot_rL=[]

    for doc_id, group in X.groupby("text_id"):
        if progress is not None:
            progress.update(1)

        # Work on a copy so the prediction column is not written into a slice of X.
        document = group.copy()
        document["pred_rougeL"] = get_preds(model, document[cols]).tolist()

        pred_summary = create_summary(document, s_num, th)
        # Look the reference up by the group's own text_id; the loop counter
        # only coincides with it when ids are exactly 0..n-1.
        ref_summary = get_ref_summary(ref_summaries, doc_id)

        r1, r2, rL = scoring(pred_summary, ref_summary)
        tot_r1.append(r1)
        tot_r2.append(r2)
        tot_rL.append(rL)
        # Per-document temporaries are released when the names are rebound on
        # the next iteration, so no explicit gc.collect() is needed here.

    return tot_r1, tot_r2, tot_rL



def doc_summary(model,document,ref_summaries,doc_id,th,s_num=4) :
    """Summarize one document, score the summary and refresh the progress display.

    Parameters
    ----------
    model : object passed to `get_preds` (which calls its `predict` method).
    document : pandas.DataFrame
        The sentences of a single document; needs the five feature columns
        listed in `cols` plus whatever columns `create_summary` reads.
        It is not modified.
    ref_summaries : passed through to `get_ref_summary`.
    doc_id : id used to look up the reference summary; also shown in the progress display.
    th : score threshold forwarded to `create_summary`.
    s_num : int, default 4
        Number of top sentences forwarded to `create_summary`.

    Returns
    -------
    (r1, r2, rL) : the three values returned by `scoring` for this document.

    Side effects: appends the three scores to the module-level lists `tot_r1`,
    `tot_r2`, `tot_rL`, writes their running means and `doc_id` into
    `doc_p_bar.postfix`, advances `doc_p_bar` by one and forces a garbage collection.
    """
    global doc_p_bar, tot_r1, tot_r2, tot_rL

    cols=["len", "s_position", "thematic_ratio", "s_pos_par", "num_ratio"]
    feats=document[cols]

    preds=get_preds(model,feats)
    # Build a new frame instead of adding a column to `document`: this function
    # is used as a groupby.apply callback, and pandas does not support callbacks
    # that mutate the group they receive.
    scored=document.assign(pred_rougeL=preds.tolist())
    pred_summary=create_summary(scored,s_num,th)
    ref_summary=get_ref_summary(ref_summaries,doc_id)
    r1,r2,rL = scoring(pred_summary,ref_summary)
    tot_r1.append(r1)
    tot_r2.append(r2)
    tot_rL.append(rL)

    # Slots 1/3/5 of the postfix list hold the running means, slot 6 the last document id.
    doc_p_bar.postfix[1] = np.mean(tot_r1)
    doc_p_bar.postfix[3] = np.mean(tot_r2)
    doc_p_bar.postfix[5] = np.mean(tot_rL)
    doc_p_bar.postfix[6]["value"] = doc_id
    doc_p_bar.update(1)

    # Release the per-document objects before returning.
    del scored, feats, preds, pred_summary, ref_summary
    gc.collect()

    return r1, r2, rL

In [None]:
# sc_type="rougeL"
# data_type="train1"
# train_data=pd.read_csv(os.path.join(df_dir,f"{data_type}_data_{sc_type}_tid.csv"))
# # train_data["text_position"] = train_data.groupby("text_id").cumcount().add(1)
# data_stats(train_data)
# labels=train_data["rougeL"].values

# sc_type="rougeL"
# data_type="test"
# test_data=pd.read_csv(os.path.join(df_dir,f"{data_type}_data_{sc_type}_tid.csv"))
# test_data["text_position"] = test_data.groupby("text_id").cumcount().add(1)
# data_stats(test_data)
# labels=test_data["rougeL"].values

In [38]:
# summaries=pd.read_csv("/kaggle/input/summarizer-data/train_summaries.csv")
# train_len=train_data.groupby("text_id").size().shape[0]

# summaries=summaries.iloc[:train_len,:]
# train_data_feats["summary"]=summaries["summary"]
# train_data_feats["sentence"]=train_data["sentence"]
# train_data_feats["text_id"]=train_data["text_id"]
# train_data_feats["text_position"]=train_data["text_position"]

# X=train_data_feats
# y=pd.DataFrame(labels)


# score_type="rougeL"
# train_data, train_data_feats, dev_data, dev_data_feats, test_data, test_data_feats = load_datasets(df_dir, sc_type=score_type)

# Load the per-sentence test set and number the sentences of each document (1-based).
sc_type="rougeL"
data_type="test"
test_data=pd.read_csv(os.path.join(df_dir,f"{data_type}_data_{sc_type}_tid.csv"))
test_data["text_position"] = test_data.groupby("text_id").cumcount().add(1)
data_stats(test_data)
labels=test_data["rougeL"].values

# Reference summaries: the row index is used as `text_id`, and the text column
# is normalised to the name "summary" when the file only provides "sum_clean".
summaries=pd.read_csv("/kaggle/input/summarizer-data/test_summaries_grouped.csv")
test_len=test_data.groupby("text_id").size().shape[0]
summaries["text_id"]=summaries.index
if "summary" not in summaries.columns:
    summaries["summary"]=summaries["sum_clean"]
    del summaries["sum_clean"]

# summaries=summaries.iloc[:dev_len,:]
# dev_data_feats["summary"]=summaries["sum_clean"]
# NOTE(review): `test_data_feats` is not created in this cell; it has to exist
# already (e.g. from the load_datasets call commented out above) — confirm.
test_data_feats["sentence"]=test_data["sentence"]
test_data_feats["text_id"]=test_data["text_id"]
test_data_feats["text_position"]=test_data["text_position"]

X=test_data_feats
# Build the targets from the `labels` loaded above. The cell previously read
# `labels_test`, a name it never defines, while leaving `labels` unused.
y=pd.DataFrame(labels)

            count      mean       std  min       25%       50%       75%  max
chosen                                                                       
0       1006718.0  0.109932  0.127422  0.0  0.051948  0.088889  0.128205  1.0
1         29216.0  0.456340  0.299931  0.0  0.210740  0.389262  0.666667  1.0


In [None]:
# Per-document ROUGE scores; doc_summary() appends to these module-level lists.
tot_r1=[]
tot_r2=[]
tot_rL=[]

# One progress step per document in X, so the total is the number of text_id groups.
doc_groups=X.groupby("text_id")
n_docs=doc_groups.ngroups

# Custom progress line read and updated by doc_summary(): running means of the
# three scores followed by "( last document id / number of documents )".
# The means start as NaN because nothing has been scored yet.
doc_p_bar=tqdm(
    total=n_docs,
    desc="Test Set Summary Scoring",
    bar_format="{postfix[0]}: {postfix[1]:.10f} | {postfix[2]}: {postfix[3]:.10f} | {postfix[4]}: {postfix[5]:.10f} ( {postfix[6][value]}/{postfix[7]} )",
    postfix=["Mean Rouge1", float("nan"), "Mean Rouge2", float("nan"), "Mean RougeL", float("nan"), dict(value=0), n_docs],
    leave=True,
)

threshold=0.3

# Iterate the documents explicitly instead of going through groupby.apply():
# doc_summary() has side effects (it appends to the tot_* lists and advances the
# progress bar), so it must run exactly once per document, with the group key as id.
# NOTE(review): get_ref_summary() raises IndexError for a text_id that has no row
# in `summaries` — confirm every document in X has a reference summary.
doc_ids=[]
doc_scores=[]
for doc_id,doc in doc_groups:
    doc_ids.append(doc_id)
    doc_scores.append(doc_summary(model,doc,summaries,doc_id,threshold))
# Same shape as before: one (r1, r2, rL) tuple per document, indexed by text_id.
scores=pd.Series(doc_scores,index=pd.Index(doc_ids,name="text_id"),dtype=object)

# Report the means of the accumulated per-document scores (the cell never defines
# bare r1/r2/rL; the results live in tot_r1/tot_r2/tot_rL).
print(f"\nMean Rouge1: {np.mean(tot_r1)}\nMean Rouge2: {np.mean(tot_r2)}\nMean RougeL: {np.mean(tot_rL)}\n\n")

Mean Rouge1: 0.0903019697 | Mean Rouge2: 0.0031850259 | Mean RougeL: 0.0671548350 ( 73/36081 )
Mean Rouge1: 0.0885600291 | Mean Rouge2: 0.0030330175 | Mean RougeL: 0.0665844330 ( 125/36081 )

In [152]:
# Sanity check: for each `text_id` group of X, show the first `text_id` value found inside that group.
X.groupby("text_id").apply(lambda x: x.text_id.values[0] )

text_id
0            0
1            1
2            2
3            3
4            4
         ...  
36305    36305
36306    36306
36307    36307
36308    36308
36309    36309
Length: 36310, dtype: int64

___
___
# **Notes**


1. scoring -> label rougeL -> sentence feats -> train
2. grid search ?! (or manual fine tuning)
3. test -> input doc -> predict score -> keep N first sentences or keep those over a threshold -> create summary -> calculate rouge1/2/L

___
### **References**

1. [Named Entity Recognition (NER) with Tensorflow](https://www.kaggle.com/code/naseralqaydeh/named-entity-recognition-ner-with-tensorflow)
2. [Extractive Summarization using Deep Learning](https://arxiv.org/pdf/1708.04439v1.pdf)
3. [NLTK](https://www.bogotobogo.com/python/NLTK/Stemming_NLTK.php)
4. [Text Features Library](https://github.com/pmbaumgartner/text-feat-lib/tree/master/notebooks)
5. []()



### **Feats**
1. [Feature extraction](https://arxiv.org/pdf/1708.04439v1.pdf)
    1. Number of thematic words
    2. Sentence position
    3. Sentence length
    4. Sentence position relative to paragraph
    5. Number of proper nouns
    6. Number of numerals
    7. Number of named entities
    8. Term Frequency-Inverse Sentence Frequency
    9. Sentence to Centroid similarity
    
    
2. [Text Summarization References](https://github.com/Tian312/awesome-text-summarization/blob/master/README.md)



___
### **Feature Base**

The feature-based model extracts features from each sentence and then evaluates the sentence's importance. A representative study is
"Sentence Extraction Based Single Document Summarization".
The following features are used in that method:

1. Position of the sentence in the input document
2. Presence of the verb in the sentence
3. Length of the sentence
4. Term frequency
5. Named entity tag NE
6. Font style

…etc. All the features are accumulated into a single score.
The number of coreferences is the number of pronouns that refer to the previous sentence. It is calculated simply by counting the pronouns that occur in the first half of the sentence, so the score represents how strongly the sentence refers to the previous one.
Now we can evaluate each sentence. The next step is selecting sentences so as to avoid duplicating information. In this paper, the words shared between the new sentence and the already selected sentences are considered, and a refinement step is applied to connect the selected sentences.
Luhn’s Algorithm is also feature-based. It evaluates the “significance” of each word, which is calculated from its frequency.
You can try feature-based text summarization with TextTeaser (PyTeaser is available for Python users).

# Unused

In [None]:
# train_set = f"..{os.sep}Data{os.sep}release{os.sep}train.jsonl"
# dev_set = f"..{os.sep}Data{os.sep}release{os.sep}dev.jsonl"
# test_set = f"..{os.sep}Data{os.sep}release{os.sep}test.jsonl"
# load json files and convert them to dataframes to load faster next time
# train_df = funs.json_to_df(train_set,"train")
# dev_df = funs.json_to_df(dev_set,"dev")
# test_df = funs.json_to_df(test_set,"test")



In [None]:
# Colab shell command: download the `tkylafi/summarizer-data` dataset with the Kaggle CLI.
!kaggle datasets download -d tkylafi/summarizer-data