In [3]:
!pip install -U 'spacy[cuda-autodetect]'
!pip install tqdm
!pip install contractions

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Collecting textsearch>=0.0.21
  Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Collecting pyahocorasick
  Downloading pyahocorasick-2.0.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.whl (103 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m103.2/103.2 KB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting anyascii
  Downloading anyascii-0.3.2-py3-none-any.whl (289 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m289.9/289.9 KB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyahocorasick, anyascii, textsearch, contr

In [4]:
import os
import re

import networkx as nx
import nltk
import numpy as np
import pandas as pd
from nltk.cluster.util import cosine_distance
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize

from tqdm import tqdm
# `tqdm._tqdm_notebook` is a deprecated private module (it emits a deprecation
# warning on import); `tqdm.notebook` is the supported public location.
from tqdm.notebook import tqdm_notebook
# Register `progress_apply` on pandas objects so long-running applies show a progress bar.
tqdm_notebook.pandas()

import spacy
# Ask spaCy to run on the GPU when one is usable; the returned flag reports
# whether that succeeded. NOTE(review): spaCy's docs advise calling this before
# any pipeline is loaded, so keep it ahead of the `spacy.load` call.
gpu = spacy.prefer_gpu()
print('GPU:', gpu)

import contractions

Please use `tqdm.notebook.*` instead of `tqdm._tqdm_notebook.*`
  from tqdm._tqdm_notebook import tqdm_notebook


GPU: True


In [5]:
# Read the raw training CSV; the path is relative to the notebook's working directory.
train_df = pd.read_csv("./raw-data/train.csv")


In [6]:
## preview the first rows of the full dataset of 300k articles
train_df.head()

Unnamed: 0,id,article,highlights
0,0001d1afc246a7964130f43ae940af6bc6c57f01,By . Associated Press . PUBLISHED: . 14:11 EST...,"Bishop John Folda, of North Dakota, is taking ..."
1,0002095e55fcbd3a2f366d9bf92a95433dc305ef,(CNN) -- Ralph Mata was an internal affairs li...,Criminal complaint: Cop used his role to help ...
2,00027e965c8264c35cc1bc55556db388da82b07f,A drunk driver who killed a young woman in a h...,"Craig Eccleston-Todd, 27, had drunk at least t..."
3,0002c17436637c4fe1837c935c04de47adb18e9a,(CNN) -- With a breezy sweep of his pen Presid...,Nina dos Santos says Europe must be ready to a...
4,0003ad6ef0c37534f80b55b4235108024b407f0b,Fleetwood are the only team still to have a 10...,Fleetwood top of League One after 2-0 win at S...


In [7]:
# Call the reference-summary column 'summary' from here on.
train_df = train_df.rename(columns={'highlights': 'summary'})

In [8]:
# Work on a 22,000-article subset. The fixed seed makes the draw repeatable, so
# every downstream artefact can be regenerated from a fresh kernel.
df = train_df.sample(n=22000, random_state=42)

In [9]:


# Load spaCy's small English pipeline with the parser and NER switched off;
# the lemmatization step only reads each token's lemma.
nlp = spacy.load(
    'en_core_web_sm',
    disable=['parser', 'ner'],
)


In [10]:
def lemmatize_text(t):
    """Return `t` with every token replaced by its lemma.

    Runs of whitespace are collapsed to a single space and the text is passed
    through ``contractions.fix`` before being parsed with the module-level
    ``nlp`` pipeline. The lemmas of all tokens are joined with single spaces.
    """
    collapsed = re.sub(r"\s+", " ", t)
    # Expand contractions before parsing (per the original note, this is also
    # meant to handle slang).
    expanded = contractions.fix(collapsed)
    lemmas = (token.lemma_ for token in nlp(expanded))
    return " ".join(lemmas)

In [11]:
# Lemmatize both text columns, one progress bar per column.
lemmatized_columns = {
    'article': 'article_lemmatized',
    'summary': 'summary_lemmatized',
}
for source_col, target_col in lemmatized_columns.items():
    df[target_col] = df[source_col].progress_apply(lemmatize_text)

  0%|          | 0/22000 [00:00<?, ?it/s]

  0%|          | 0/22000 [00:00<?, ?it/s]

In [12]:
# Spot-check one row to confirm the lemmatized columns were added.
df.head(1)

Unnamed: 0,id,article,summary,article_lemmatized,summary_lemmatized
29447,53b1206f8620d661ebc42f49ae2db71316a5136d,(CNN) -- The Senate Homeland Security Committe...,New: Rep. King says he declined a meeting with...,( CNN ) -- the Senate Homeland Security Commit...,new : Rep. King say he decline a meeting with ...


In [14]:
nltk.download('stopwords')
## NLTK's English stopwords, extended with words that are too frequent
STOPWORDS = [
    *stopwords.words("english"),
    "cnn", "say", "said", "new", "all", "due", "to", "on", "daily",
]
## cleaning function
def clean_text(t, stopwords=None):
    """Normalize text into lowercase, punctuation-free, space-separated tokens.

    Steps, in order: insert a space after any '.' that is immediately followed
    by a non-digit word character (so "end.Next" becomes "end. Next" while
    "3.5" is left alone), drop every character that is neither a word
    character nor whitespace, lowercase, split on whitespace, optionally
    remove stopwords, and join the remaining tokens with single spaces.

    Args:
        t: Text to clean. Non-string values are converted with ``str()``.
        stopwords: Optional iterable of tokens to drop. Matching is exact and
            happens after lowercasing and punctuation removal, so entries need
            to be lowercase and punctuation-free to match anything. ``None``
            disables stopword removal.

    Returns:
        The cleaned text as a single string (empty if nothing survives).
    """
    ### separate sentences with '. '
    t = re.sub(r'\.(?=[^ \W\d])', '. ', str(t))
    ### remove punctuations and characters
    t = re.sub(r'[^\w\s]', '', t)
    ### lowercase and tokenize (split() already discards surrounding whitespace)
    t_list = t.lower().split()

    ### remove Stopwords
    if stopwords is not None:
        # Materialize once: constant-time lookups instead of scanning a list
        # for every token, and correct results for one-shot iterators.
        stopword_set = frozenset(stopwords)
        t_list = [word for word in t_list if word not in stopword_set]
    ### back to string
    return " ".join(t_list)



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [19]:
# Clean both lemmatized columns with the shared stopword list.
cleaned_columns = {
    'article_lemmatized': 'article_cleaned',
    'summary_lemmatized': 'summary_cleaned',
}
for source_col, target_col in cleaned_columns.items():
    df[target_col] = df[source_col].progress_apply(clean_text, stopwords=STOPWORDS)

  0%|          | 0/22000 [00:00<?, ?it/s]

  0%|          | 0/22000 [00:00<?, ?it/s]

In [20]:
# Spot-check one row to confirm the cleaned columns were added.
df.head(1)

Unnamed: 0,id,article,summary,article_lemmatized,summary_lemmatized,article_cleaned,summary_cleaned
29447,53b1206f8620d661ebc42f49ae2db71316a5136d,(CNN) -- The Senate Homeland Security Committe...,New: Rep. King says he declined a meeting with...,( CNN ) -- the Senate Homeland Security Commit...,new : Rep. King say he decline a meeting with ...,senate homeland security committee schedule pu...,rep king decline meeting one prostitute lieber...


In [21]:
from sklearn.model_selection import train_test_split

# Fixed seed so that train/val/test membership is identical on every run.
SPLIT_SEED = 42

# Hold out 2,000 rows for testing first, then 4,000 of the remainder for validation.
train_df, test_df = train_test_split(df, test_size=2000, random_state=SPLIT_SEED)
train_df, val_df = train_test_split(train_df, test_size=4000, random_state=SPLIT_SEED)
print(len(train_df))
print(len(val_df))
print(len(test_df))

16000
4000
2000


In [22]:
# `to_csv` does not create missing directories, so make sure the target exists
# before writing (a fresh checkout has no ./processed-data yet).
os.makedirs('./processed-data', exist_ok=True)
# The DataFrame index is written as the first, unnamed column of each file.
train_df.to_csv('./processed-data/train.csv')
val_df.to_csv('./processed-data/val.csv')
test_df.to_csv('./processed-data/test.csv')