In [None]:
# Data loading
from google.colab import drive
import zipfile
import pandas as pd
# Preprocessing
import re
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Feature extraction
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from gensim.models import Word2Vec

# Util
from tqdm import tqdm
from pprint import pprint

# NLTK resources used by the preprocessing step: 'punkt' for word_tokenize,
# 'stopwords' for the stop-word list, 'wordnet' for WordNetLemmatizer.
# NOTE(review): newer NLTK releases may additionally need 'punkt_tab' for
# word_tokenize — confirm against the installed version.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# English stop words as a set for O(1) membership tests. This has to run after
# `stopwords` is imported and its corpus is downloaded; building it before the
# import raised NameError on a fresh kernel.
stop_words = set(stopwords.words('english'))



In [None]:
# Mount Google Drive at /content/drive so the dataset archive can be read from it.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Location of the zipped dataset that is unpacked and loaded below.
# NOTE(review): this path is relative, so it only resolves into the Drive mounted
# at /content/drive while the working directory is /content — confirm.
DATA_PATH = "drive/MyDrive/articles1.csv.zip"

In [None]:
# Extract the CSV file from the zip archive.
# extractall() is called without a target directory, so every member of the
# archive is unpacked into the current working directory.
with zipfile.ZipFile(DATA_PATH, "r") as zip_ref:
    zip_ref.extractall()

# Load the extracted CSV data into a Pandas DataFrame.
# NOTE(review): assumes the archive contains a top-level "articles1.csv" — confirm.
df = pd.read_csv("articles1.csv")

In [None]:
# Extract content as its preprocessing will differ from other features.
# `articles` is the 'content' column of `df`, i.e. one text entry per row.
articles = df['content']

In [55]:
# Peek at the raw text of the first article (the entry with index label 0).
articles[0]

'WASHINGTON  —   Congressional Republicans have a new fear when it comes to their    health care lawsuit against the Obama administration: They might win. The incoming Trump administration could choose to no longer defend the executive branch against the suit, which challenges the administration’s authority to spend billions of dollars on health insurance subsidies for   and   Americans, handing House Republicans a big victory on    issues. But a sudden loss of the disputed subsidies could conceivably cause the health care program to implode, leaving millions of people without access to health insurance before Republicans have prepared a replacement. That could lead to chaos in the insurance market and spur a political backlash just as Republicans gain full control of the government. To stave off that outcome, Republicans could find themselves in the awkward position of appropriating huge sums to temporarily prop up the Obama health care law, angering conservative voters who have been 

In [140]:
def preprocess_articles(articles, limit=10):
    """Tokenize, filter and lemmatize raw article texts.

    Each article is tokenized with ``word_tokenize`` and lower-cased, then
    stop words (the module-level ``stop_words`` set) and punctuation-only
    tokens are dropped, the remaining tokens are lemmatized with
    ``WordNetLemmatizer`` and joined back together with single spaces.

    Args:
        articles: Sliceable sequence of article strings (e.g. a list or a
            pandas Series).
        limit: Maximum number of leading articles to process. Defaults to 10,
            the cap that was previously hard-coded in the loop; pass ``None``
            to process every article.

    Returns:
        list[str]: One space-joined string of cleaned tokens per processed
        article, in input order.
    """
    lemmatizer = WordNetLemmatizer()
    punctuation_chars = set(string.punctuation)
    articles_preprocessed = []

    for article in tqdm(articles[:limit], desc='Preprocessing articles'):
        # Tokenize
        tokens = [token.lower() for token in word_tokenize(article)]

        # Remove stop words & punctuation.
        # A token counts as punctuation when *every* character is punctuation.
        # The earlier `token not in string.punctuation` was a substring test
        # against a str, so multi-character tokens such as '--' or '...'
        # slipped through.
        kept = [
            token for token in tokens
            if token not in stop_words
            and not all(char in punctuation_chars for char in token)
        ]

        # Lemmatize and re-join into a single string per article
        articles_preprocessed.append(
            ' '.join(lemmatizer.lemmatize(token) for token in kept)
        )

    return articles_preprocessed


In [141]:
# Clean the raw article texts. With this call preprocess_articles handles only
# the first 10 articles, so `preprocessed` is a list of at most 10 strings.
preprocessed = preprocess_articles(articles)

Preprocessing articles: 100%|██████████| 10/10 [00:00<00:00, 50.57it/s]


In [136]:
def extract_features(articles, method='tf-idf'):
    """Turn preprocessed article strings into a bag-of-words feature matrix.

    Args:
        articles: Iterable of preprocessed article strings.
        method: Which vectorizer to fit: ``'tf-idf'`` (``TfidfVectorizer``,
            the default) or ``'countvectorizer'`` (``CountVectorizer``).

    Returns:
        tuple: ``(feature_vectors, feature_names)`` where ``feature_vectors``
        is the document-term matrix returned by ``fit_transform`` and
        ``feature_names`` is the vocabulary returned by
        ``get_feature_names_out()``.

    Raises:
        ValueError: If ``method`` is not one of the supported names.
    """
    if method == 'tf-idf':
        vectorizer = TfidfVectorizer()
    elif method == 'countvectorizer':
        vectorizer = CountVectorizer()
    else:
        raise ValueError('Invalid method specified.')
    # TODO:
    #   Add word2vec

    # Both methods are fitted on *all* given articles. The 'tf-idf' branch used
    # to silently truncate its input to the first 10 documents while
    # 'countvectorizer' did not; callers that want a subset should slice first.
    feature_vectors = vectorizer.fit_transform(articles)
    feature_names = vectorizer.get_feature_names_out()

    return feature_vectors, feature_names

In [137]:
# Vectorize the preprocessed texts with the default method ('tf-idf').
# `vectors` is the fitted feature matrix, `names` the matching feature names.
vectors, names = extract_features(preprocessed)

In [138]:
# Vocabulary size: number of feature names produced by the vectorizer.
len(names)

3670

In [139]:
# Print every feature name on its own line.
# NOTE(review): this dumps the entire vocabulary (3670 entries in the saved
# run); consider previewing a slice such as names[:20] instead.
for name in names:
  print(name)

000
10
100
101
106
11
115
12
125
1279
13
14
146th
15
150
151st
152
157
16
17
176
18
184
1882
19
1906
191
1910
1920
1921
1923
1930s
1932
1934
1936
1937
1938
1940s
1941
1942
1946
1949
1951
1952
1955
1960s
1968
1969
1980s
1983
1987
1988
1991
1995
1996
20
200
2000
2001
2003
2005
2007
2009
2010
2011
2012
2013
2014
2015
2016
2017
20th
21
22
235
239
24
240
245
25
255
263
265
27
275
276
289
29
295
30
300
300s
31
32
343
35
36
360
37
370
38
40
400
40th
4100
42
430
444
45
450
458
46
460
485
50
52
521
53
54
550
552
57
58
600
62
63
64
65
66
67
68
69
700
71
72
74
76
77
770
79
80
800
81
83
83rd
84
86
88
90
900
911
96
960
abbas
abe
ability
able
absent
academy
accepted
access
accessory
accidentally
acclaim
according
accordingly
account
accounting
accurate
acknowledge
acknowledged
acolyte
acra
across
act
acting
action
active
activist
activity
actual
actually
adam
adamantly
adapting
added
addict
adding
addition
additional
address
adequate
administration
admired
admission
admits
adult
adulyadej
advancing
