# Text Vectorization and Feature Engineering Assignment

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [30]:
import pandas as pd
from nltk import sent_tokenize
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### Read the CNN Lite plain text file articles into a corpus using the NLTK's PlaintextCorpusReader.

In [4]:
PATH = '/content/drive/MyDrive/python_for_data_scientists/cnn_lite'
DOC_PATTERN = r'.*\.txt'
cnn = PlaintextCorpusReader(PATH , DOC_PATTERN)

### Iterate through the fileids in the corpus, extract the raw text of each document, and store them in a list.

In [5]:
docs = [cnn.raw(fileid) for fileid in cnn.fileids()]

### Preprocess and clean the documents according to the steps below.

- Word Tokenize
- Lowercase
- Remove Stopwords
- Remove Punctuation
- Lemmatize
- Stem

In [27]:
tokens = [word_tokenize(doc) for doc in docs]
no_stop = [[token.lower() for token in doc if token.lower() not in stopwords.words('english')] for doc in tokens]
no_punc = [[token.lower() for token in doc if token.isalpha()] for doc in no_stop]

lemmatizer = WordNetLemmatizer()
lemma = [[lemmatizer.lemmatize(token) for token in doc] for doc in no_punc]

stemmer = SnowballStemmer('english')
stemmed = [[stemmer.stem(token) for token in doc] for doc in lemma]

In [29]:
print(len(docs))
print(len(no_stop))
print(len(no_punc))
print(len(lemma))
print(len(stemmed))

57
57
57
57
57


### Count vectorize the preprocessed documents.

In [34]:
processed = [' '.join(word for word in doc) for doc in stemmed]

In [37]:
vectorize = CountVectorizer()
vectors = vectorize.fit_transform(processed)

count = pd.DataFrame(vectors.toarray(), columns=vectorize.get_feature_names())
count.head()

Unnamed: 0,aaron,aback,abc,abet,abil,abl,abroad,absolut,absorb,absorpt,absurd,abund,abus,academ,academi,accent,accept,access,accid,accident,accompani,accomplish,accord,account,accumul,accur,accuraci,accus,achiev,acid,acknowledg,acquaint,acquisit,acquit,across,act,action,activ,activist,actor,...,wri,write,writer,written,wrong,wrongdo,wrote,wurst,wyom,xi,xinhua,xyz,ya,yanke,yawn,yea,year,yell,yellow,yermak,yes,yet,york,yorker,young,younger,youngster,youth,youtub,yovanovitch,yudkin,yuriy,zaia,zaid,zakaria,zanoni,zelenski,zero,zone,édouard
0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,3,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,...,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,...,0,0,0,0,1,0,2,0,0,0,0,0,0,0,0,0,16,1,0,0,2,2,3,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0
3,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


### One hot vectorize the preprocessed documents.

In [38]:
vectorize1 = CountVectorizer(binary=True)
vectors1 = vectorize1.fit_transform(processed)

one_hot = pd.DataFrame(vectors1.toarray(), columns=vectorize1.get_feature_names())
one_hot.head()

Unnamed: 0,aaron,aback,abc,abet,abil,abl,abroad,absolut,absorb,absorpt,absurd,abund,abus,academ,academi,accent,accept,access,accid,accident,accompani,accomplish,accord,account,accumul,accur,accuraci,accus,achiev,acid,acknowledg,acquaint,acquisit,acquit,across,act,action,activ,activist,actor,...,wri,write,writer,written,wrong,wrongdo,wrote,wurst,wyom,xi,xinhua,xyz,ya,yanke,yawn,yea,year,yell,yellow,yermak,yes,yet,york,yorker,young,younger,youngster,youth,youtub,yovanovitch,yudkin,yuriy,zaia,zaid,zakaria,zanoni,zelenski,zero,zone,édouard
0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,...,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,...,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,1,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
3,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


### TF-IDF vectorize the preprocessed documents.

In [39]:
vectorize2 = TfidfVectorizer()
vectors2 = vectorize2.fit_transform(processed)

tf_idf = pd.DataFrame(vectors2.toarray(), columns=vectorize2.get_feature_names())
tf_idf.head()

Unnamed: 0,aaron,aback,abc,abet,abil,abl,abroad,absolut,absorb,absorpt,absurd,abund,abus,academ,academi,accent,accept,access,accid,accident,accompani,accomplish,accord,account,accumul,accur,accuraci,accus,achiev,acid,acknowledg,acquaint,acquisit,acquit,across,act,action,activ,activist,actor,...,wri,write,writer,written,wrong,wrongdo,wrote,wurst,wyom,xi,xinhua,xyz,ya,yanke,yawn,yea,year,yell,yellow,yermak,yes,yet,york,yorker,young,younger,youngster,youth,youtub,yovanovitch,yudkin,yuriy,zaia,zaid,zakaria,zanoni,zelenski,zero,zone,édouard
0,0.0,0.0,0.0,0.0,0.023331,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.028279,0.020437,0.0,0.0,0.0,0.021278,0.0,0.022231,0.0,0.0,0.0,...,0.031173,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.019005,0.046469,0.0,0.055151,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.057781,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.041758,0.0,0.0,...,0.0,0.045424,0.0,0.0,0.0,0.0,0.038832,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.031649,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.017877,0.0,0.0,0.0,0.01172,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.010899,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.012202,0.009629,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.010899,0.0,0.021798,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.106524,0.01504,0.0,0.0,0.02676,0.021798,0.026649,0.0,0.010543,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.035754,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.028956,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.016314,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.038688,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.029829,0.0,0.0,0.0,0.045498,0.0,0.0,0.0,0.041274,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.026832,0.0,0.045498,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Use Doc2Vec to vectorize the preprocessed documents.

Set the size of the vectors to be the same size as those of the other methods using the `vector_size` argument.

In [40]:
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(processed)]
model = Doc2Vec(documents, vector_size=count.shape[1])

doc2vec = pd.DataFrame([[document]+list(model[document]) for document in range(len(docs))]).drop(0, axis=1)
doc2vec.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,...,4615,4616,4617,4618,4619,4620,4621,4622,4623,4624,4625,4626,4627,4628,4629,4630,4631,4632,4633,4634,4635,4636,4637,4638,4639,4640,4641,4642,4643,4644,4645,4646,4647,4648,4649,4650,4651,4652,4653,4654
0,0.002184,0.003847,-0.001884,-0.002581,0.005256,0.002785,0.005886,0.003447,-0.006418,-0.005917,-0.009206,-0.008609,0.004381,0.001832,-0.000782,-0.018698,0.000702,0.009771,-0.004555,0.012206,0.001721,0.005698,0.006481,0.004806,-0.017498,0.010432,-0.000974,-0.001991,0.01475,0.00172,-0.019357,0.003157,-0.013639,-0.009954,-0.005449,0.012792,-0.008282,-0.007507,0.002337,0.024816,...,0.002369,0.00514,-0.011963,-0.001069,-0.006175,0.003203,-0.019139,0.010089,-0.020645,-0.015056,-0.005949,-0.009276,0.000299,-0.002644,-0.000768,-5.4e-05,0.014801,0.010997,0.013555,-0.010872,-0.016436,-0.014079,-0.02334,-0.009297,0.007809,0.003499,0.016815,0.010443,-0.008921,-0.009702,-0.000389,-0.003625,-0.000516,-0.006254,0.003899,0.021192,0.004566,0.012938,0.0142,0.002344
1,0.002693,0.004514,-0.002345,-0.002752,0.006336,0.003453,0.007126,0.004492,-0.007739,-0.007265,-0.011305,-0.010194,0.005312,0.002082,-0.001307,-0.022278,0.000932,0.011655,-0.005409,0.014336,0.001848,0.006315,0.007583,0.005511,-0.020834,0.012533,-0.001485,-0.002655,0.017777,0.00176,-0.023164,0.004099,-0.016977,-0.012308,-0.006621,0.015971,-0.010483,-0.009527,0.002936,0.02878,...,0.002436,0.005013,-0.012013,-0.000984,-0.006537,0.003437,-0.02083,0.010817,-0.022258,-0.01634,-0.006464,-0.009519,0.000456,-0.002803,-0.000583,0.00014,0.01498,0.011502,0.014699,-0.011994,-0.017835,-0.015181,-0.024157,-0.009878,0.008235,0.003492,0.017961,0.010923,-0.009753,-0.009982,-0.000574,-0.003991,-0.000562,-0.006725,0.003985,0.022563,0.004641,0.013728,0.015063,0.002363
2,0.000766,0.002824,-0.001631,-0.002083,0.00296,0.001681,0.00452,0.002263,-0.00495,-0.003623,-0.005611,-0.005779,0.003088,0.000999,-7.5e-05,-0.01273,0.000177,0.007448,-0.002574,0.008957,0.001493,0.004527,0.005218,0.003202,-0.012393,0.006955,-0.000551,-0.001257,0.010192,0.001769,-0.013619,0.001698,-0.006357,-0.004389,-0.002626,0.006039,-0.003648,-0.003037,0.00068,0.011936,...,0.001959,0.004949,-0.012194,-0.001537,-0.005139,0.002719,-0.018786,0.010203,-0.020684,-0.015253,-0.005583,-0.008705,-4.1e-05,-0.001768,-0.001405,-7.2e-05,0.014399,0.010667,0.013234,-0.010948,-0.016753,-0.014088,-0.021874,-0.008722,0.007402,0.002978,0.01604,0.010321,-0.009591,-0.008766,9.6e-05,-0.003497,-0.00037,-0.005849,0.004389,0.020534,0.004289,0.01286,0.013997,0.002467
3,0.002123,0.004303,-0.002313,-0.002875,0.005562,0.003001,0.006735,0.004045,-0.007477,-0.00668,-0.009945,-0.009357,0.004853,0.001711,-0.000995,-0.020698,0.000755,0.011088,-0.004616,0.013618,0.002024,0.006181,0.007297,0.005064,-0.019439,0.011405,-0.001145,-0.002306,0.016367,0.001982,-0.021573,0.003493,-0.014312,-0.010324,-0.005622,0.013365,-0.008635,-0.007855,0.00226,0.024319,...,0.002312,0.005399,-0.012869,-0.001268,-0.006419,0.003409,-0.02155,0.011285,-0.023398,-0.017131,-0.006485,-0.009848,0.000427,-0.002739,-0.000832,-6.4e-05,0.015839,0.011871,0.015197,-0.012643,-0.018938,-0.016052,-0.024723,-0.010149,0.008498,0.003383,0.018771,0.011449,-0.010661,-0.010147,-0.000345,-0.003963,-0.000539,-0.006998,0.004451,0.02374,0.004939,0.014354,0.015748,0.002452
4,0.003281,0.005234,-0.002536,-0.003275,0.00741,0.00423,0.008439,0.005395,-0.009031,-0.008834,-0.013562,-0.012004,0.006485,0.002476,-0.001625,-0.026692,0.001029,0.013904,-0.006572,0.017125,0.00202,0.007533,0.008867,0.006589,-0.024967,0.015246,-0.001807,-0.003189,0.021168,0.002135,-0.02763,0.00498,-0.021051,-0.015327,-0.008264,0.019489,-0.012896,-0.012032,0.003494,0.035856,...,0.002802,0.00574,-0.01335,-0.001041,-0.007351,0.003938,-0.023038,0.01173,-0.024634,-0.017856,-0.007047,-0.010534,0.000671,-0.003451,-0.000555,0.000183,0.016756,0.012669,0.016412,-0.013183,-0.019788,-0.016993,-0.027254,-0.011061,0.009204,0.004076,0.020267,0.012145,-0.010773,-0.011372,-0.000909,-0.004345,-0.000599,-0.0077,0.004205,0.025138,0.005299,0.015281,0.016922,0.002636
