# Text Vectorization and Feature Engineering Assignment

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [19]:
import pandas as pd
from nltk import sent_tokenize
from nltk import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus.reader.plaintext import PlaintextCorpusReader

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

### Read the CNN Lite plain text file articles into a corpus using the NLTK's PlaintextCorpusReader.

In [6]:
PATH = '/content/drive/MyDrive/Thinkful/NLP/cnn_articles/'

DOC_PATTERN = r'.*\.txt'

corpus = PlaintextCorpusReader(PATH, DOC_PATTERN)

### Iterate through the fileids in the corpus, extract the raw text of each document, and store them in a list.

In [7]:
docs = [corpus.raw(fileid) for fileid in corpus.fileids()]

In [8]:
docs

['Pink taking a break to focus on familyUpdated 10:18 AM ET, Thu November 14, 2019(CNN) - Pink has been working pretty hard and it sounds like she will be taking a step back in 2020.Speaking with "Entertainment Tonight" on the Country Music Association Awards red carpet, the singer was joined by her husband, Carey Hart, and their kids Willow, 8, and Jameson, 2.Pink was there to perform her song "Love Me Anyway" with country star Chris Stapleton, and she  talked about how hectic things have been. "We did two and a half years of [music] and Willow\'s back in school now, Jameson\'s going to start pre-school soon," Pink said. "It\'s kind of the year of the family."The star also praised her husband, with whom she will celebrate 14 years of marriage in January."Carey has a lot going on as well," she said of Hart, who went from being a professional motocross competitor to racing off-road trucks. "He\'s super supportive, he follows me around the world and now it\'s his turn."According to Billb

### Preprocess and clean the documents according to the steps below.

- Word Tokenize
- Lowercase
- Remove Stopwords
- Remove Punctuation
- Lemmatize
- Stem

In [12]:
tokenized = [word_tokenize(doc) for doc in docs]
print(tokenized)



In [14]:
lowercase = [[token.lower() for token in doc] for doc in tokenized]
print(lowercase)



In [16]:
no_stopwords = [[token for token in doc if token not in stopwords.words('english')] for doc in lowercase]
print(no_stopwords)



In [18]:
no_punct = [[token for token in doc if token.isalpha() == True] for doc in no_stopwords]
print(no_punct)



In [21]:
def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

lemmatizer = WordNetLemmatizer()
lemmatized = [[lemmatizer.lemmatize(token, get_wordnet_pos(token)) for token in doc] for doc in no_punct]
print(lemmatized)



In [22]:
stemmer = SnowballStemmer('english')
stemmed = [[stemmer.stem(token) for token in doc] for doc in lemmatized]
print(stemmed)

[['pink', 'take', 'break', 'focus', 'familyupd', 'et', 'thu', 'novemb', 'cnn', 'pink', 'work', 'pretti', 'hard', 'sound', 'like', 'take', 'step', 'back', 'entertain', 'tonight', 'countri', 'music', 'associ', 'award', 'red', 'carpet', 'singer', 'join', 'husband', 'carey', 'hart', 'kid', 'willow', 'jameson', 'perform', 'song', 'love', 'anyway', 'countri', 'star', 'chris', 'stapleton', 'talk', 'hectic', 'thing', 'two', 'half', 'year', 'music', 'willow', 'back', 'school', 'jameson', 'go', 'start', 'soon', 'pink', 'say', 'kind', 'year', 'famili', 'star', 'also', 'prais', 'husband', 'celebr', 'year', 'marriag', 'januari', 'carey', 'lot', 'go', 'well', 'say', 'hart', 'go', 'profession', 'motocross', 'competitor', 'race', 'truck', 'super', 'support', 'follow', 'around', 'world', 'turn', 'accord', 'billboard', 'pink', 'beauti', 'trauma', 'tour', 'rank', 'tour', 'time', 'earn', 'million'], ['deval', 'patrick', 'tell', 'alli', 'run', 'presidentupd', 'pm', 'et', 'wed', 'novemb', 'cnn', 'former', '

In [29]:
preprocessed_docs = [' '.join([token for token in doc]) for doc in stemmed]
print(preprocessed_docs)
len(preprocessed_docs)

['pink take break focus familyupd et thu novemb cnn pink work pretti hard sound like take step back entertain tonight countri music associ award red carpet singer join husband carey hart kid willow jameson perform song love anyway countri star chris stapleton talk hectic thing two half year music willow back school jameson go start soon pink say kind year famili star also prais husband celebr year marriag januari carey lot go well say hart go profession motocross competitor race truck super support follow around world turn accord billboard pink beauti trauma tour rank tour time earn million', 'deval patrick tell alli run presidentupd pm et wed novemb cnn former massachusett gov deval patrick told friend alli phone call wednesday make decis run presid two peopl familiar matter time formal announc still flux wednesday attent impeach sourc familiar plan tell cnn patrick offici file new hampshir primari thursday concord appear cbs morn odd could incred steep late entrant like patrick well 

14

### Count vectorize the preprocessed documents.

In [30]:
vectorizer = CountVectorizer()
vectors = vectorizer.fit_transform(preprocessed_docs)
count = pd.DataFrame(vectors.toarray(), columns=vectorizer.get_feature_names())
count.head()

Unnamed: 0,abc,abet,abil,abl,abort,absolut,abund,abus,accent,accept,accid,accomplic,accomplish,accord,account,accus,acknowledg,across,act,action,activ,activist,actor,actual,adam,add,addit,address,administr,admir,admiss,adopt,ador,advanc,advantag,advic,advis,aerial,affair,affect,...,william,willow,win,wind,window,wit,withheld,withhold,within,without,wolfson,woman,wonder,wood,word,work,worker,world,worm,worri,would,wound,wow,wowtaylor,wrap,write,wrong,wrongdo,xinhua,ya,yawnther,year,yell,yes,yesterday,yet,york,yorker,young,zelenski
0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,3,0,0,0,0,1,0,0,0,0,0,5,0,0,0,0,2,0,0,0
2,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,...,0,0,1,0,0,0,0,0,1,0,2,0,0,0,0,1,0,0,0,0,2,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,2,0,1,0
4,1,0,1,0,0,0,0,2,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,1,1,3,0,1,2,0,0,0,0,1,0,0,2,0,0,1,...,0,0,0,0,0,5,3,1,1,1,0,0,0,0,0,4,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,7


### One hot vectorize the preprocessed documents.

In [31]:
vectorizer = CountVectorizer(binary=True)
vectors = vectorizer.fit_transform(preprocessed_docs)
one_hot = pd.DataFrame(vectors.toarray(), columns=vectorizer.get_feature_names())
one_hot.head()

Unnamed: 0,abc,abet,abil,abl,abort,absolut,abund,abus,accent,accept,accid,accomplic,accomplish,accord,account,accus,acknowledg,across,act,action,activ,activist,actor,actual,adam,add,addit,address,administr,admir,admiss,adopt,ador,advanc,advantag,advic,advis,aerial,affair,affect,...,william,willow,win,wind,window,wit,withheld,withhold,within,without,wolfson,woman,wonder,wood,word,work,worker,world,worm,worri,would,wound,wow,wowtaylor,wrap,write,wrong,wrongdo,xinhua,ya,yawnther,year,yell,yes,yesterday,yet,york,yorker,young,zelenski
0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0
2,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,...,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,1,0
4,1,0,1,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,1,1,1,0,1,1,0,0,0,0,1,0,0,1,0,0,1,...,0,0,0,0,0,1,1,1,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1


### TF-IDF vectorize the preprocessed documents.

In [33]:
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(preprocessed_docs)

tfidf = pd.DataFrame(vectors.toarray(), columns= vectorizer.get_feature_names())
tfidf.head()

Unnamed: 0,abc,abet,abil,abl,abort,absolut,abund,abus,accent,accept,accid,accomplic,accomplish,accord,account,accus,acknowledg,across,act,action,activ,activist,actor,actual,adam,add,addit,address,administr,admir,admiss,adopt,ador,advanc,advantag,advic,advis,aerial,affair,affect,...,william,willow,win,wind,window,wit,withheld,withhold,within,without,wolfson,woman,wonder,wood,word,work,worker,world,worm,worri,would,wound,wow,wowtaylor,wrap,write,wrong,wrongdo,xinhua,ya,yawnther,year,yell,yes,yesterday,yet,york,yorker,young,zelenski
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.061739,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.211264,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.052934,0.0,0.081347,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.171183,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.050668,0.050668,0.039019,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.050782,0.0,0.0,0.0,0.0,0.076173,0.0,0.0,0.0,0.0,0.039019,0.0,0.0,0.0,0.0,0.0,0.136852,0.0,0.0,0.0,0.0,0.059229,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.033871,0.0,0.0,0.0,0.0,0.0,0.02844,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.033871,0.0,0.030928,0.0,0.0,0.0,0.0,0.0,0.145979,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.037472,0.0,0.04866,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04866,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.033055,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.033055,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.038192,0.0,0.0,0.0,0.0,0.0,0.0,0.029411,0.0,0.0,0.0,...,0.0,0.0,0.033055,0.0,0.0,0.0,0.0,0.0,0.026584,0.0,0.076383,0.0,0.0,0.0,0.0,0.019139,0.0,0.0,0.0,0.0,0.038277,0.0,0.0,0.0,0.0,0.0,0.022322,0.0,0.0,0.0,0.0,0.020631,0.0,0.0,0.0,0.0,0.044644,0.0,0.026584,0.0
4,0.023713,0.0,0.020524,0.0,0.0,0.0,0.0,0.033012,0.0,0.0,0.023713,0.0,0.0,0.01386,0.0,0.0,0.0,0.0,0.018261,0.0,0.0,0.0,0.0,0.016506,0.016506,0.045216,0.0,0.023713,0.033012,0.0,0.0,0.0,0.0,0.020524,0.0,0.0,0.036522,0.0,0.0,0.020524,...,0.0,0.0,0.0,0.0,0.0,0.091306,0.071139,0.020524,0.016506,0.018261,0.0,0.0,0.0,0.0,0.0,0.047532,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.023713,0.0,0.0,0.0,0.020524,0.0,0.0,0.023713,0.0,0.0,0.020524,0.0,0.0,0.01386,0.0,0.0,0.143667


### Use Doc2Vec to vectorize the preprocessed documents.

Set the size of the vectors to be the same size as those of the other methods using the `vector_size` argument.

In [34]:
documents = [TaggedDocument(doc, [i]) for i,doc in enumerate(preprocessed_docs)]

model = Doc2Vec(documents)

doc2vec = pd.DataFrame([[document] + list(model[document])
                        for document in range(len(docs))]).drop(0, axis=1)

doc2vec.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,...,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100
0,0.080154,-0.066884,0.023471,0.035229,-0.012145,-0.030144,0.07399,-0.073526,0.059658,-0.010232,-0.005224,0.007873,0.003353,0.040999,-0.13256,0.036548,-0.002069,-0.051178,-0.036968,0.032093,-0.001085,0.013546,-0.01531,-0.057544,-0.042992,0.047801,-0.083226,0.061412,-0.047069,0.086171,-0.019865,0.01454,0.018904,0.029958,0.05362,0.092871,0.049194,0.017041,0.028247,0.001139,...,-0.154869,-0.084719,0.067801,-0.11452,-0.025337,-0.054814,0.034126,0.001181,0.015831,0.024089,-0.036553,-0.056545,0.068236,0.057716,0.017075,0.054808,0.029238,-0.072908,0.009543,-0.040914,0.00365,-0.034204,-0.028754,-0.085781,-0.046496,-0.002618,0.013369,0.04472,0.020819,-0.088085,0.035041,-0.013074,-0.044054,0.00524,0.039844,-0.047634,0.010518,-0.068852,0.066159,0.038862
1,0.211371,-0.172444,0.052386,0.093667,-0.032392,-0.084088,0.210333,-0.192949,0.158533,-0.012765,-0.015836,0.028429,0.009914,0.113777,-0.336862,0.095663,0.006985,-0.124515,-0.0849,0.091894,-0.010283,0.037815,-0.028482,-0.146498,-0.104347,0.108464,-0.224387,0.176617,-0.109906,0.214777,-0.061603,0.021528,0.059615,0.079751,0.122786,0.230752,0.133794,0.035989,0.090426,0.013302,...,-0.405548,-0.218594,0.172328,-0.29534,-0.062424,-0.155204,0.097588,-0.012374,0.029345,0.061522,-0.094772,-0.140418,0.169558,0.158039,0.04429,0.134247,0.073054,-0.19247,0.018066,-0.101345,0.02083,-0.082452,-0.073588,-0.213699,-0.108449,-0.018486,0.022448,0.125372,0.045775,-0.221903,0.078674,-0.028554,-0.102053,0.021033,0.106738,-0.137005,0.030167,-0.180136,0.161782,0.096262
2,0.170699,-0.141388,0.040974,0.082489,-0.029976,-0.059679,0.165863,-0.165248,0.131682,-0.017333,-0.01866,0.016988,0.009516,0.091226,-0.280608,0.079392,0.007254,-0.097151,-0.068565,0.068849,-0.008939,0.038831,-0.020932,-0.116623,-0.085892,0.092504,-0.179365,0.141417,-0.098691,0.182807,-0.050909,0.021212,0.041863,0.072732,0.107117,0.193948,0.109356,0.025956,0.074483,0.009402,...,-0.332952,-0.182984,0.143232,-0.241875,-0.057851,-0.127797,0.08316,-0.008282,0.020233,0.041973,-0.080164,-0.119964,0.140296,0.130519,0.033645,0.111676,0.054653,-0.155035,0.015077,-0.08742,0.011086,-0.070992,-0.058597,-0.169397,-0.08859,-0.01296,0.023019,0.101688,0.035202,-0.187657,0.073338,-0.018742,-0.091336,0.018666,0.088031,-0.117432,0.023936,-0.150196,0.129047,0.077404
3,0.181205,-0.146506,0.037797,0.082664,-0.02768,-0.06857,0.172795,-0.161268,0.132497,-0.008438,-0.011682,0.021339,0.006395,0.094189,-0.291559,0.077985,-0.000976,-0.099414,-0.074328,0.076395,-0.006828,0.038375,-0.023784,-0.124483,-0.079868,0.099653,-0.186647,0.149128,-0.095467,0.185864,-0.056159,0.026108,0.043798,0.071218,0.103886,0.195983,0.109822,0.023525,0.073379,0.012619,...,-0.340877,-0.189355,0.143086,-0.254619,-0.057849,-0.134658,0.085244,-0.010962,0.027373,0.045981,-0.081535,-0.128501,0.142453,0.128591,0.033536,0.109204,0.060759,-0.168729,0.020556,-0.091373,0.013824,-0.071103,-0.066722,-0.178184,-0.091156,-0.014758,0.022477,0.105119,0.040617,-0.186916,0.070103,-0.019524,-0.086832,0.021412,0.084403,-0.118329,0.027997,-0.153025,0.141818,0.087153
4,0.186902,-0.152931,0.046288,0.082861,-0.028191,-0.073771,0.174923,-0.177274,0.14453,-0.017945,-0.019508,0.027052,0.006005,0.105324,-0.30103,0.08507,0.005761,-0.109989,-0.083525,0.08002,-0.006125,0.035529,-0.025379,-0.131403,-0.091889,0.101039,-0.197552,0.153573,-0.100398,0.195675,-0.048335,0.020498,0.050518,0.077193,0.116825,0.206557,0.118483,0.027453,0.07463,0.010482,...,-0.358249,-0.19636,0.149744,-0.26413,-0.060996,-0.136791,0.091935,-0.006578,0.026893,0.053843,-0.088326,-0.124533,0.152837,0.135881,0.044544,0.121936,0.058739,-0.166757,0.018196,-0.094367,0.015971,-0.073999,-0.063746,-0.185819,-0.095281,-0.018679,0.025275,0.111207,0.041733,-0.20203,0.080947,-0.024508,-0.100028,0.021478,0.093764,-0.114828,0.021871,-0.151306,0.149096,0.085435


#Lecture Notes

In [4]:
PATH = '/content/drive/MyDrive/Thinkful/NLP/oreily_rss/'

DOC_PATTERN = r'.*\.txt'

corpus = PlaintextCorpusReader(PATH, DOC_PATTERN)

In [5]:
docs = [corpus.raw(fileid) for fileid in corpus.fileids()]

In [7]:
vectorizer = CountVectorizer()
vectors = vectorizer.fit_transform(docs)

In [10]:
vectors.toarray().shape

(60, 5186)

In [None]:
vectorizer.get_feature_names()

In [13]:
count = pd.DataFrame(vectors.toarray(), columns= vectorizer.get_feature_names())
count.head()

Unnamed: 0,000,00s,01,10,100,1000,100k,11,12,13,14,15,18,182,185,19,1927,1950s,1953,1970s,1990s,1992,1b,20,2003,2006,2008,2010,2018,2019,2020,2021,2025,21st,22,24,247,25,26,300s,...,worst,worth,worthwhile,would,wouldn,wounds,wrangling,wrap,wrapping,write,writer,writes,writing,written,wrong,wronged,wrote,wsj,xiana,xu,ya,yacc,yanking,yann,year,years,yes,yet,yield,yocto,york,you,your,yourself,yyyy,zero,zipline,zoning,zoom,état
0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,2,0,0,0,0,0,0,6,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,7,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,30,4,0,0,0,0,0,0,0
2,0,0,0,1,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,2,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,1,0,0,2,1,1,0,0,0,0,0,0
4,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,2,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,3,0,0,0,0,0,0,0,0


In [14]:
vectorizer = CountVectorizer(binary=True)
vectors = vectorizer.fit_transform(docs)

one_hot = pd.DataFrame(vectors.toarray(), columns= vectorizer.get_feature_names())
one_hot.head()

Unnamed: 0,000,00s,01,10,100,1000,100k,11,12,13,14,15,18,182,185,19,1927,1950s,1953,1970s,1990s,1992,1b,20,2003,2006,2008,2010,2018,2019,2020,2021,2025,21st,22,24,247,25,26,300s,...,worst,worth,worthwhile,would,wouldn,wounds,wrangling,wrap,wrapping,write,writer,writes,writing,written,wrong,wronged,wrote,wsj,xiana,xu,ya,yacc,yanking,yann,year,years,yes,yet,yield,yocto,york,you,your,yourself,yyyy,zero,zipline,zoning,zoom,état
0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0
2,0,0,0,1,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,1,1,0,0,0,0,0,0
4,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0


In [17]:
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(docs)

tfidf = pd.DataFrame(vectors.toarray(), columns= vectorizer.get_feature_names())
tfidf.head()

Unnamed: 0,000,00s,01,10,100,1000,100k,11,12,13,14,15,18,182,185,19,1927,1950s,1953,1970s,1990s,1992,1b,20,2003,2006,2008,2010,2018,2019,2020,2021,2025,21st,22,24,247,25,26,300s,...,worst,worth,worthwhile,would,wouldn,wounds,wrangling,wrap,wrapping,write,writer,writes,writing,written,wrong,wronged,wrote,wsj,xiana,xu,ya,yacc,yanking,yann,year,years,yes,yet,yield,yocto,york,you,your,yourself,yyyy,zero,zipline,zoning,zoom,état
0,0.044066,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04747,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.104533,0.0,0.0,0.0,0.0,0.0,0.0,0.22467,0.052267,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.052267,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.196344,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.01221,0.0,0.0,0.0,0.0,0.0,0.016814,0.0,0.068593,0.0,0.016814,0.011369,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.012704,0.0,0.0,0.0,0.0,0.214942,0.033858,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.04948,0.0,0.0,0.0,0.072108,0.060794,0.06549,0.060794,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.06549,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.028502,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.028502,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.031895,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.035007,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.046507,0.0,0.0,0.028502,0.0,0.0,0.022061,0.013031,0.02259,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.119862,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.119862,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.134131,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.147218,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.119862,0.0,0.139162,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
documents = [TaggedDocument(doc, [i]) for i,doc in enumerate(docs)]

model = Doc2Vec(documents)

doc2vec = pd.DataFrame([[document] + list(model[document])
                        for document in range(len(docs))]).drop(0, axis=1)

In [20]:
doc2vec.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,...,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100
0,-0.085335,0.010965,-0.052708,0.003382,0.038814,0.134856,-0.016798,0.067259,-0.021726,-0.146837,-0.016827,-0.08962,0.102059,0.017223,0.015242,0.001981,-0.126979,0.175534,-0.065103,-0.070615,-0.108524,0.013153,0.131796,0.02296,-0.055472,-0.048614,0.131259,0.00639,0.151336,-0.103018,0.132727,-0.017534,0.149384,-0.006355,-0.01763,0.246749,-0.039024,-0.036389,-0.187579,-0.078795,...,-0.011612,-0.02156,-0.020549,0.175193,0.019114,0.04744,-0.134274,-0.083189,-0.107867,-0.055187,-0.054003,0.017222,-0.075043,0.011328,-0.00247,0.019208,-0.021339,-0.042214,-0.029014,-0.020347,-0.059925,-0.017902,-0.068902,-0.030441,0.012978,0.052859,-0.114723,0.055764,0.159252,-0.030366,0.125941,0.042974,-0.070732,0.058759,-0.038922,0.016004,0.023671,-0.081851,0.068077,0.062819
1,-0.153871,-0.028139,-0.041253,-0.040634,0.029185,0.149795,-0.010548,0.010765,-0.037633,-0.164924,-0.051191,-0.069149,0.112375,0.105,-0.010168,0.012071,-0.123457,0.188502,-0.035627,-0.071056,-0.128481,0.029152,0.16937,0.054085,-0.087971,-0.085343,0.168733,-0.014426,0.177666,-0.121019,0.112105,-0.008803,0.141125,0.048943,-0.015398,0.216719,-0.040939,-0.113773,-0.219316,-0.10471,...,-0.049134,-0.042454,-0.133699,0.140308,0.012041,0.069659,-0.083177,-0.016552,-0.06339,-0.047902,-0.060936,-0.054771,-0.045651,-0.015169,0.018462,-0.012921,-0.026891,0.006249,-0.04481,0.068777,-0.069423,-0.012815,-0.094834,-0.010992,0.092704,0.070683,-0.11331,0.03664,0.220528,-0.042374,0.131873,0.013268,-0.086758,0.045782,-0.067085,0.06486,-0.005908,-0.086999,0.070527,-0.02624
2,-0.040862,0.025909,-0.098603,0.045409,0.04195,0.196153,-0.035146,0.169866,-0.032833,-0.177781,-0.071115,-0.170168,0.149613,-0.026155,0.054383,-0.008181,-0.246568,0.270056,-0.105248,-0.083344,-0.180153,0.004662,0.119437,-0.014716,-0.02911,-0.049213,0.147723,0.033991,0.175264,-0.135959,0.224042,-0.063089,0.220578,-0.087887,0.000652,0.413715,-0.032181,0.019955,-0.297327,-0.117707,...,0.006319,0.004604,0.045154,0.304787,0.071659,0.080892,-0.228978,-0.196225,-0.216925,-0.08734,-0.108622,0.062056,-0.130902,0.019678,-0.031376,0.034024,-0.051397,-0.104054,-0.060657,-0.128893,-0.095755,-0.057467,-0.089107,-0.068557,-0.068318,0.112442,-0.184203,0.054302,0.27321,-0.042581,0.194116,0.148698,-0.100301,0.132024,-0.035043,0.016454,0.098334,-0.144706,0.163265,0.165125
3,-0.199609,-0.024813,-0.041283,-0.078391,0.060565,0.189529,-0.016747,0.016269,-0.053518,-0.202197,-0.01022,-0.057976,0.125506,0.133848,-0.00745,0.001637,-0.139692,0.214455,-0.047346,-0.122217,-0.168442,0.062849,0.218542,0.102558,-0.105273,-0.090897,0.205952,-0.007175,0.221631,-0.157991,0.127418,-0.01054,0.177027,0.096499,-0.041842,0.246001,-0.060164,-0.147299,-0.228576,-0.126456,...,-0.039377,-0.053084,-0.158272,0.151593,-0.023011,0.072653,-0.130376,-0.008458,-0.042934,-0.040609,-0.043266,-0.05219,-0.056857,-0.008203,0.019156,-0.014047,-0.009349,0.029374,-0.037913,0.08752,-0.086428,-0.007977,-0.109446,-0.002179,0.148192,0.035723,-0.097763,0.071454,0.210765,-0.039015,0.158189,-0.036567,-0.083382,0.020691,-0.087436,0.048612,-0.038125,-0.085789,0.052924,-0.052445
4,-0.128482,-0.016507,-0.046352,-0.041802,0.042058,0.153299,-0.019674,0.046617,-0.037466,-0.160961,-0.039168,-0.066892,0.104334,0.077032,0.005941,0.005283,-0.143704,0.191463,-0.05298,-0.077822,-0.128448,0.030958,0.154073,0.051449,-0.079261,-0.06999,0.15378,0.00831,0.181147,-0.12151,0.125838,-0.019347,0.157986,0.029369,-0.015115,0.247601,-0.044272,-0.08165,-0.218298,-0.101935,...,-0.023241,-0.034048,-0.10003,0.155201,0.012229,0.067346,-0.119722,-0.040396,-0.076674,-0.052186,-0.053686,-0.024547,-0.054514,-0.011687,0.006918,-0.005168,-0.03183,-0.015057,-0.02991,0.024209,-0.073578,-0.012487,-0.097917,-0.010503,0.070891,0.063388,-0.10434,0.049666,0.203325,-0.045664,0.137916,0.016073,-0.07527,0.0452,-0.048223,0.036769,0.010219,-0.093074,0.072065,0.009098


In [21]:
doc2vec.shape

(60, 100)