# Text Vectorization and Feature Engineering Assignment

In [1]:
import nltk
import pandas as pd

from nltk import sent_tokenize
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus.reader.plaintext import PlaintextCorpusReader

nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\c\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


True

### Read the CNN Lite plain text file articles into a corpus using the NLTK's PlaintextCorpusReader.

In [4]:
path = "cnn_lite/"
doc_pattern = r".*\.txt"
corpus = PlaintextCorpusReader(path, doc_pattern)

### Iterate through the fileids in the corpus, extract the raw text of each document, and store them in a list.

In [6]:
docs = [corpus.raw(id_) for id_ in corpus.fileids()]

### Preprocess and clean the documents according to the steps below.

- Word Tokenize
- Lowercase
- Remove Stopwords
- Remove Punctuation
- Lemmatize
- Stem

In [9]:
# word tokenize
word_tokenized = [word_tokenize(doc) for doc in docs]
# lowercase transform and removing stopwords
lowercase_trans = [[token.lower() for token in doc if token.lower() not in stopwords.words("english")] for doc in word_tokenized]
# removing punctuations
no_punct = [[token for token in doc if token.isalpha()] for doc in lowercase_trans]
# lemmatization
lemmatizer = WordNetLemmatizer()
lemmatized = [[lemmatizer.lemmatize(token) for token in doc] for doc in no_punct]
# stemming
stemmer = SnowballStemmer("english")
stemmed = [[stemmer.stem(token) for token in doc] for doc in lemmatized]

### Count vectorize the preprocessed documents.

In [10]:
proc_docs = [
    ' '.join(word for word in doc) for doc in stemmed
]

In [11]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
vectors = vectorizer.fit_transform(proc_docs)
count_vectorized = pd.DataFrame(vectors.toarray(), columns=vectorizer.get_feature_names())
count_vectorized

Unnamed: 0,aaron,aback,abc,abet,abil,abl,abroad,absolut,absorb,absorpt,...,yudkin,yuriy,zaia,zaid,zakaria,zanoni,zelenski,zero,zone,édouard
0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,2,0
3,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
7,0,0,3,0,0,1,0,0,0,0,...,0,0,0,1,0,0,3,0,0,0
8,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### One hot vectorize the preprocessed documents.

In [12]:
vectorizer = CountVectorizer(binary=True)
vecotrs = vectorizer.fit_transform(proc_docs)
one_hot = pd.DataFrame(vectors.toarray(), columns=vectorizer.get_feature_names())
one_hot

Unnamed: 0,aaron,aback,abc,abet,abil,abl,abroad,absolut,absorb,absorpt,...,yudkin,yuriy,zaia,zaid,zakaria,zanoni,zelenski,zero,zone,édouard
0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,2,0
3,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
7,0,0,3,0,0,1,0,0,0,0,...,0,0,0,1,0,0,3,0,0,0
8,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### TF-IDF vectorize the preprocessed documents.

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(proc_docs)
tfidf = pd.DataFrame(vectors.toarray(), columns=vectorizer.get_feature_names())
tfidf

Unnamed: 0,aaron,aback,abc,abet,abil,abl,abroad,absolut,absorb,absorpt,...,yudkin,yuriy,zaia,zaid,zakaria,zanoni,zelenski,zero,zone,édouard
0,0.0,0.0,0.0,0.0,0.023331,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.017877,0.0,0.0,0.0,0.01172,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.035754,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.028956,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.029829,0.0,0.0,0.0,0.045498,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.046811,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.022299,0.0,0.0,0.005372,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.008194,0.0,0.0,0.016115,0.0,0.0,0.0
8,0.026109,0.0,0.0,0.0,0.0,0.0,0.0,0.021541,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Use Doc2Vec to vectorize the preprocessed documents.

Set the size of the vectors to be the same size as those of the other methods using the `vector_size` argument.

In [15]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

docs = [TaggedDocument(doc, [i]) for i, doc in enumerate(proc_docs)]
docs[0]

TaggedDocument(words='de moin iowa cnn alexandria cap debut iowa hust support berni sander last week blunt call action someth allow happen u let race happen u said ralli vermont senat council bluff watch presidenti race movi movement yet certain cinemat qualiti last six week sander second democrat presidenti campaign near sidelin wors heart attack la vega first night octob sander chart remark reviv power run invigor endors new poll result show gain steam new hampshir iowa sens fuel part massiv crowd welcom recent ralli new york minnesota polit revolut tri summer back march also sander joke follow saturday climat summit de moin stent thank got three arteri work right pretti good deadpan practic jumper basketbal court drake univers better one block arteri feel realli good support staff say much back along fellow squad member ilhan omar rashida tlaib news broke recent debat last month ohio bolster argument sander make month campaign steadili attract racial divers young work class coalit r

In [16]:
model = Doc2Vec(docs, vector_size=tfidf.shape[1])
vectors = [model[i] for i in range(len(docs))]
doc2vec = pd.DataFrame(vectors)
doc2vec

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4644,4645,4646,4647,4648,4649,4650,4651,4652,4653
0,0.001532,0.006135,0.02906,-0.001854,-0.002265,-0.002942,0.000536,-0.010728,-0.01109,-0.001314,...,0.011713,0.00044,0.01418,-0.005127,0.011412,-0.003137,-0.01027,-0.009217,-0.021734,0.002756
1,0.001881,0.007152,0.031424,-0.002053,-0.002732,-0.003276,0.000502,-0.011479,-0.011849,-0.001345,...,0.011561,0.000419,0.014169,-0.005159,0.011239,-0.002969,-0.010361,-0.009318,-0.021483,0.002818
2,0.002077,0.006981,0.029707,-0.002018,-0.002799,-0.003706,0.000495,-0.011665,-0.012071,-0.001317,...,0.010227,0.000437,0.012622,-0.004357,0.01,-0.002474,-0.009257,-0.008275,-0.018912,0.002618
3,0.001875,0.00676,0.029817,-0.002074,-0.002984,-0.003715,0.000701,-0.012995,-0.013468,-0.001735,...,0.012974,0.000663,0.016038,-0.00567,0.012863,-0.003563,-0.01166,-0.010536,-0.024351,0.00331
4,0.00192,0.00715,0.031547,-0.002143,-0.003086,-0.00384,0.000694,-0.013363,-0.013956,-0.00154,...,0.01262,0.000401,0.015407,-0.005501,0.012331,-0.003216,-0.011195,-0.010118,-0.023475,0.003188
5,0.001805,0.007052,0.031174,-0.002155,-0.002831,-0.003335,0.000484,-0.012099,-0.012461,-0.001611,...,0.011779,0.000203,0.014165,-0.005318,0.011333,-0.002996,-0.010294,-0.009385,-0.02179,0.002893
6,0.002036,0.006801,0.027969,-0.002026,-0.003047,-0.003655,0.000731,-0.012402,-0.012923,-0.001611,...,0.01122,0.000266,0.013771,-0.004771,0.010895,-0.002676,-0.010204,-0.009013,-0.020652,0.002867
7,0.0023,0.006647,0.02738,-0.001996,-0.002416,-0.00274,0.00096,-0.010384,-0.01097,-0.001402,...,0.011293,0.000376,0.013773,-0.004679,0.010912,-0.003105,-0.010307,-0.009016,-0.020725,0.002983
8,0.001829,0.006108,0.026168,-0.001737,-0.003085,-0.003628,0.000589,-0.012353,-0.01299,-0.001369,...,0.011185,0.000301,0.013835,-0.005017,0.01104,-0.002666,-0.010025,-0.008998,-0.020842,0.00286
9,0.002004,0.006801,0.030464,-0.002021,-0.002478,-0.003255,0.000586,-0.011319,-0.011667,-0.001282,...,0.011645,0.000385,0.014018,-0.005053,0.011084,-0.003063,-0.010223,-0.009211,-0.021246,0.002877
