## Import of needed libraries and config file

In [None]:
!pip install contractions

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Collecting textsearch>=0.0.21 (from contractions)
  Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Collecting anyascii (from textsearch>=0.0.21->contractions)
  Downloading anyascii-0.3.2-py3-none-any.whl (289 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m289.9/289.9 kB[0m [31m24.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyahocorasick (from textsearch>=0.0.21->contractions)
  Downloading pyahocorasick-2.0.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.8/110.8 kB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyahocorasick, anyascii, textsearch, contractions
Successfully installed anyascii-0.3.2 contrac

In [None]:
from google.colab import drive
import pandas as pd
import contractions
import nltk
nltk.download('punkt') # one time execution
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.tokenize import sent_tokenize, RegexpTokenizer
import re

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [None]:
# Mount Google Drive; the config file read below lives under /content/gdrive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
import json

# Project configuration; later cells read the dataset paths from it.
# The encoding is passed explicitly so decoding does not depend on the
# platform's locale default.
with open('/content/gdrive/MyDrive/TextMiningProj_Maugeri_Morelli/Topic modeling/config.json', 'r', encoding='utf-8') as f:
    config = json.load(f)

## Import of raw files
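These files come from the CNN/DailyMail dataset available on [Kaggle](https://www.kaggle.com/datasets/gowrishankarp/newspaper-text-summarization-cnn-dailymail). The versions loaded here already include a `cleaned_text` column, which is the one used in the rest of the notebook.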

In [None]:
# Read the train / validation / test splits from the CSV paths stored in the config
train, val, test = (
    pd.read_csv(config[path_key])
    for path_key in ("path_train_cleaned", "path_val_cleaned", "path_test_cleaned")
)

In [None]:
# Preview the first rows to inspect the columns of the loaded training split
train.head(5)

Unnamed: 0.1,Unnamed: 0,id,source_text,target_text,cleaned_text
0,0,0001d1afc246a7964130f43ae940af6bc6c57f01,By . Associated Press . PUBLISHED: . 14:11 EST...,"Bishop John Folda, of North Dakota, is taking ...",The bishop of the Fargo Catholic Diocese in N...
1,1,0002095e55fcbd3a2f366d9bf92a95433dc305ef,(CNN) -- Ralph Mata was an internal affairs li...,Criminal complaint: Cop used his role to help ...,Ralph Mata was an internal affairs lieutenant ...
2,2,00027e965c8264c35cc1bc55556db388da82b07f,A drunk driver who killed a young woman in a h...,"Craig Eccleston-Todd, 27, had drunk at least t...",A drunk driver who killed a young woman in a h...
3,3,0002c17436637c4fe1837c935c04de47adb18e9a,(CNN) -- With a breezy sweep of his pen Presid...,Nina dos Santos says Europe must be ready to a...,With a breezy sweep of his pen President Vladi...
4,4,0003ad6ef0c37534f80b55b4235108024b407f0b,Fleetwood are the only team still to have a 10...,Fleetwood top of League One after 2-0 win at S...,Fleetwood are the only team still to have a 10...


## Concatenation of the datasets
The CNN dataset comes split into three parts: train, validation and test. Since topic modeling is an unsupervised learning technique, we concatenate them into a single large dataset on which to train the models.

In [None]:
# Stack the `cleaned_text` column of every split into one Series with a fresh index
texts = pd.concat(
    [split['cleaned_text'] for split in (train, val, test)],
    ignore_index=True,
)

In [None]:
# Sanity check: concatenating Series yields a Series (converted to a DataFrame further down)
type(texts)

pandas.core.series.Series

## Preprocessing
Definition of the functions that are later applied to the dataset to preprocess it.

In [None]:
def preprocess_text(txt, punkt=True, lower=True, contr=True, lst_stopwords=None, stemm=False, lemm=True):
    """Normalize one document and return it as a single space-separated string.

    Parameters
    ----------
    txt : object
        The document; it is converted with ``str()`` before processing.
    punkt : bool
        Remove every character that is neither a word character nor whitespace.
    lower : bool
        Lowercase the text.
    contr : bool
        Expand contractions with ``contractions.fix``.
    lst_stopwords : iterable of str or None
        Tokens to drop. ``None`` disables stopword removal.
    stemm : bool
        Apply NLTK's Porter stemmer to every token.
    lemm : bool
        Apply NLTK's WordNet lemmatizer to every token.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces (may be empty).

    Notes
    -----
    Flags are evaluated for truthiness. Apostrophe-dependent steps
    (contraction expansion, possessive stripping) run *before* punctuation
    removal; otherwise the apostrophes they rely on are already gone.
    """
    # Membership tests on a list are linear in its length; build a set once.
    if lst_stopwords is None:
        stopwords = None
    elif isinstance(lst_stopwords, (set, frozenset)):
        stopwords = lst_stopwords
    else:
        stopwords = set(lst_stopwords)

    # Clean
    txt = re.sub(r'\.(?=[^ \W\d])', '. ', str(txt))  # separate sentences with '. '
    if lower:
        txt = txt.lower()
    if contr:
        txt = contractions.fix(txt)  # expand contractions while apostrophes still exist
    txt = re.sub(r"['\u2019]s\b", "", txt)  # drop possessive 's (straight or curly apostrophe)
    if punkt:
        txt = re.sub(r'[^\w\s]', '', txt)  # remove punctuation and other symbols
    txt = txt.replace('"', '')  # double quotes are removed even when punkt is off

    ## Tokenize (convert from string to list); split() also collapses whitespace
    lst_txt = txt.split()

    ## Stopwords, first pass: filter the surface forms before stemming or
    ## lemmatization can turn them into something that is not in the list.
    if stopwords is not None:
        lst_txt = [word for word in lst_txt if word not in stopwords]

    ## Stemming (remove -ing, -ly, ...)
    if stemm:
        ps = nltk.stem.porter.PorterStemmer()
        lst_txt = [ps.stem(word) for word in lst_txt]

    ## Lemmatization (convert the word into root word)
    if lemm:
        lem = nltk.stem.wordnet.WordNetLemmatizer()
        lst_txt = [lem.lemmatize(word) for word in lst_txt]

    ## Stopwords, second pass: catch tokens that only became stopwords after
    ## normalisation (and any normalised forms listed explicitly by the caller).
    if stopwords is not None and (stemm or lemm):
        lst_txt = [word for word in lst_txt if word not in stopwords]

    # Back to string
    return " ".join(lst_txt)

In [None]:
# Apply preprocessing function
def apply_preprocess(df, punkt=False, lower=False, contr=True, lst_stopwords=None, stemm=False, lemm=False, remove_na=True):
    """Run ``preprocess_text`` on every column of ``df``.

    For each column ``c`` of the input, rows where ``c`` is null are dropped
    and a new column ``"<c>_clean"`` holding the preprocessed text is added.
    Rows whose cleaned text is empty are printed (first five) and, when
    ``remove_na`` is truthy, removed as well.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns all contain text. It is not modified.
    punkt, lower, contr, lst_stopwords, stemm, lemm
        Forwarded to ``preprocess_text``.
    remove_na : bool
        Drop rows whose cleaned text is the empty string.

    Returns
    -------
    pandas.DataFrame
        A new frame with the original columns plus one ``*_clean`` column
        per original column.
    """
    df_cleaned = df.copy()

    # Build the stopword set once instead of once per document.
    stopwords = set(lst_stopwords) if lst_stopwords is not None else None

    # Iterate over the *input* columns only, so the "_clean" columns added
    # below are never processed themselves.
    for col in list(df.columns):
        # .copy() makes the filtered frame independent, so the column
        # assignment below does not trigger SettingWithCopyWarning.
        df_cleaned = df_cleaned.loc[df_cleaned[col].notna()].copy()

        clean_col = f"{col}_clean"  # f-string also works for non-string labels
        df_cleaned[clean_col] = df_cleaned[col].apply(
            lambda x: preprocess_text(x, punkt, lower, contr, stopwords, stemm, lemm)
        )

        ## residuals: documents that became empty after cleaning.
        ## Kept in a local mask rather than a temporary "check" column, so a
        ## frame with no columns (or with its own "check" column) is handled.
        is_empty = df_cleaned[clean_col].map(len) == 0
        if is_empty.any():
            print("--- found NAs ---")
            print(df_cleaned.loc[is_empty, [col, clean_col]].head())
            if remove_na:
                df_cleaned = df_cleaned.loc[~is_empty].copy()

    return df_cleaned

In [None]:
def create_stopwords(lst_langs=("english",), lst_add_words=(), lst_keep_words=()):
    """Build a sorted, duplicate-free stopword list.

    Parameters
    ----------
    lst_langs : iterable of str, or str
        Languages whose NLTK stopword lists are merged.
    lst_add_words : iterable of str, or str
        Extra words to add to the list.
    lst_keep_words : iterable of str, or str
        Words to remove from the list (i.e. keep in the text).

    Each argument may also be a single string, which is treated as one item
    rather than being iterated character by character.

    Returns
    -------
    list of str
        The stopwords in ascending order.
    """
    def _as_items(value):
        # A bare string would otherwise be consumed one character at a time.
        return [value] if isinstance(value, str) else value

    stopwords = set()
    for lang in _as_items(lst_langs):
        stopwords.update(nltk.corpus.stopwords.words(lang))
    stopwords.update(_as_items(lst_add_words))
    stopwords.difference_update(_as_items(lst_keep_words))
    return sorted(stopwords)

Definition of the stopwords list:

In [None]:
# stopwords list
# Default (English) NLTK stopwords from create_stopwords() plus corpus-specific
# extras appended at the end. "wa" and "ha" are presumably the lemmatized forms
# of "was"/"has" that would otherwise survive the filter — TODO confirm.
lst_stopwords = create_stopwords() + ["cnn", "new", "wa", "ha", "said", "mr", "would", "also", "per", "cent", "one"]
print(lst_stopwords)

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', 'her', 'here', 'hers', 'herself', 'him', 'himself', 'his', 'how', 'i', 'if', 'in', 'into', 'is', 'isn', "isn't", 'it', "it's", 'its', 'itself', 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she', "she's", 'should', "should've", 'shouldn', "shouldn't", 'so', 'some',

Preprocess of the documents:

In [None]:
# apply_preprocess iterates over DataFrame columns, so wrap the Series in a frame.
# NOTE: this rebinds `texts` from a Series to a DataFrame, so the cell is not
# safe to run twice on the same kernel.
texts = texts.to_frame()

In [None]:
# Punctuation removal, lowercasing, stopword removal and lemmatization are on;
# stemming is off. Rows whose cleaned text ends up empty are printed and dropped.
# NOTE: `texts` is rebound to the result; re-running this cell would also
# process the `cleaned_text_clean` column added by the first run.
texts = apply_preprocess(texts, punkt=True, lower=True, lst_stopwords=lst_stopwords, stemm=False, lemm=True, remove_na=True)

--- found NAs ---
       cleaned_text cleaned_text_clean
77524         (CNN)                   
124491        (CNN)                   
191099        (CNN)                   
246266        (CNN)                   
279294         When                   


In [None]:
# Number of documents remaining after preprocessing
len(texts)

311952

## Save data to csv
This lets us import the preprocessed file later on, without having to run the preprocessing again.

In [None]:
# Save data to csv
# NOTE(review): to_csv writes the DataFrame index as an extra unnamed column by
# default; on reload it shows up as "Unnamed: 0". Pass index=False if downstream
# notebooks do not rely on that column — confirm before changing.
texts.to_csv("/content/gdrive/MyDrive/TextMiningProj_Maugeri_Morelli/Topic modeling/texts_preproc.csv")