# Text Vectorization and Feature Engineering Assignment

In [0]:
import pandas as pd
import requests
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from bs4 import BeautifulSoup
from nltk import sent_tokenize
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus.reader.plaintext import PlaintextCorpusReader

In [0]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

### Read the CNN Lite plain text file articles into a corpus using the NLTK's PlaintextCorpusReader.

In [0]:
articles = ['http://lite.cnn.io/en/article/h_eac18760a7a7f9a1bf33616f1c4a336d',
            'http://lite.cnn.io/en/article/h_de3f82f17d289680dd2b47c6413ebe7c',
            'http://lite.cnn.io/en/article/h_72f4dc9d6f35458a89af014b62e625ad',
            'http://lite.cnn.io/en/article/h_aa21fe6bf176071cb49e09d422c3adf0',
            'http://lite.cnn.io/en/article/h_8ad34a532921c9076cdc9d7390d2f1bc',
            'http://lite.cnn.io/en/article/h_84422c79110d9989177cfaf1c5f45fe7',
            'http://lite.cnn.io/en/article/h_d010d9580abac3a44c6181ec6fb63d58',
            'http://lite.cnn.io/en/article/h_fb11f4e9d7c5323e75b337d9e9e5e368',
            'http://lite.cnn.io/en/article/h_7b27f0b131067f8ece6238ac559670ab',
            'http://lite.cnn.io/en/article/h_8cae7f735fa9573d470f802063ceffe2',
            'http://lite.cnn.io/en/article/h_72c3668280e82576fcc2602b0fa70c14',
            'http://lite.cnn.io/en/article/h_d20658fb0e20212051cda0e0a7248c8a',
            'http://lite.cnn.io/en/article/h_56611c43d7928120d2ae21666ccc7417',
            'http://lite.cnn.io/en/article/h_bda0394e3c5ee7054ee65c022bca7695']

In [0]:
TAGS = ['p']
data = []

for article in articles:
    response = requests.get(article)
    content = response.text
    soup = BeautifulSoup(content)
    text_list = [tag.get_text() for tag in soup.find_all(TAGS)]
    text = ' '.join(text_list)
    data.append(text)


In [0]:
!mkdir news_articles

for i, d in enumerate(data):
    file = open(f'./news_articles/article_{i}.txt', 'w')
    file.write(data[i])
    file.close()

In [0]:
from nltk.corpus.reader.plaintext import CategorizedPlaintextCorpusReader
import nltk
nltk.download('punkt')
DOC_PATTERN = r'.*\.txt'
CAT_PATTERN = r'([\w_\s]+/.*)'

corpus = PlaintextCorpusReader('news_articles', DOC_PATTERN)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


### Iterate through the fileids in the corpus, extract the raw text of each document, and store them in a list.

In [0]:
doc_list = []
for fileid in corpus.fileids():
    doc = corpus.raw(fileid)
    if len(doc)>0:
        doc_list.append(doc)

doc_list

['(CNN) - Pink has been working pretty hard and it sounds like she will be taking a step back in 2020. Speaking with "Entertainment Tonight" on the Country Music Association Awards red carpet, the singer was joined by her husband, Carey Hart, and their kids Willow, 8, and Jameson, 2. Pink was there to perform her song "Love Me Anyway" with country star Chris Stapleton, and she  talked about how hectic things have been.  "We did two and a half years of [music] and Willow\'s back in school now, Jameson\'s going to start pre-school soon," Pink said. "It\'s kind of the year of the family." The star also praised her husband, with whom she will celebrate 14 years of marriage in January. "Carey has a lot going on as well," she said of Hart, who went from being a professional motocross competitor to racing off-road trucks. "He\'s super supportive, he follows me around the world and now it\'s his turn." According to Billboard, Pink\'s Beautiful Trauma Tour ranks as the 10th highest-grossing tou

In [0]:
df = pd.DataFrame( doc_list, columns= ['text'])

In [0]:
df.head()

Unnamed: 0,text,word_tokens
0,(CNN) - Pink has been working pretty hard and ...,"[(, CNN, ), -, Pink, has, been, working, prett..."
1,(CNN) - Former Massachusetts Gov. Deval Patric...,"[(, CNN, ), -, Former, Massachusetts, Gov, ., ..."
2,(CNN) - There's a 10-week-old puppy in Missour...,"[(, CNN, ), -, There, 's, a, 10-week-old, pupp..."
3,(CNN) - Three Democratic heavyweights this wee...,"[(, CNN, ), -, Three, Democratic, heavyweights..."
4,(CNN) - The House Intelligence Committee opene...,"[(, CNN, ), -, The, House, Intelligence, Commi..."


In [0]:
df['word_tokens'] = df['text'].apply(lambda x : word_tokenize(x))

### Preprocess and clean the documents according to the steps below.

- Word Tokenize
- Lowercase
- Remove Stopwords
- Remove Punctuation
- Lemmatize
- Stem

In [0]:
import spacy

In [0]:
nlp = spacy.load('en')


In [0]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [0]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [0]:
token_list=[]
for doc in doc_list:
    temp_list =[]
    words = word_tokenize(doc)
    for word in words:
        if word.lower() not in stopwords.words('english') and word.lower().isalpha():
            temp_list.append(word.lower())
    token_list.append(temp_list)


In [0]:
lemmatizer = WordNetLemmatizer()
lemmatized = [[lemmatizer.lemmatize(token) for token in doc] for doc in token_list]

In [0]:
lemmatized [:5]

In [0]:
stemmer  = SnowballStemmer('english')
stemmed = [[stemmer.stem(token) for token in doc] for doc in lemmatized]

In [0]:
stemmed[0]

In [0]:
docs_list = [' '.join(doc) for doc in stemmed]

In [0]:
docs_list

['cnn pink work pretti hard sound like take step back speak entertain tonight countri music associ award red carpet singer join husband carey hart kid willow jameson pink perform song love anyway countri star chris stapleton talk hectic thing two half year music willow back school jameson go start soon pink said kind year famili star also prais husband celebr year marriag januari carey lot go well said hart went profession motocross competitor race truck super support follow around world turn accord billboard pink beauti trauma tour rank tour time earn million cabl news network turner broadcast system right reserv listen cnn usag go full cnn experi',
 'cnn former massachusett gov deval patrick told friend alli phone call wednesday made decis run presid two peopl familiar matter say plan time formal announc still flux wednesday attent impeach sourc familiar plan tell cnn patrick offici file new hampshir primari thursday concord appear cbs morn odd could incred steep late entrant like pa

### Count vectorize the preprocessed documents.

In [0]:
c_vect = CountVectorizer()
c_vects = c_vect.fit_transform(docs_list)

In [0]:
count = pd.DataFrame(c_vects.toarray(), columns= c_vect.get_feature_names(
))

In [0]:
count

Unnamed: 0,abc,abet,abil,abl,abort,absolut,abund,abus,accent,accept,accid,accomplic,accomplish,accord,account,accus,acknowledg,across,act,action,activ,activist,actor,actual,ad,adam,add,addit,address,administr,admir,admiss,adopt,ador,advanc,advantag,advic,advis,aerial,affair,...,wind,window,wit,withheld,withhold,within,without,wolfson,woman,wonder,wood,word,work,worker,world,worm,worri,wors,worst,would,wound,wow,wrap,write,wrong,wrongdo,wrote,xinhua,ya,yawn,year,yell,yes,yesterday,yet,york,yorker,young,younger,zelenski
0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,3,0,0,0,0,0,0,2,0,0,0,6,0,0,0,0,2,0,0,0,0
2,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0
3,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,2,0,0,0,0,1,0,0,0,0,0,0,2,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,2,0,0,1,0
4,1,0,1,0,0,0,0,2,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,1,3,2,0,0,1,2,0,0,0,0,1,0,0,2,0,0,...,0,0,5,3,1,1,1,0,0,0,0,0,5,0,0,0,0,0,0,0,0,2,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,7
5,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1,0,2,2,0,3,0,1,0,0,2,0,0,0,4,1,0,0,0,0,0,4,1,4,0,0,1,0,1,0,0
6,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,2,0,0,1,0,1,2,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,8,0,0,0,0,0,0,0,0,2,1,0,0,0,1,0,1,1,0,1,2,3,1,0,1,0,0,0,2,0,0,0,0,1,1,0,0,0
7,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0
8,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,6,0,0,0,0,0,0,0,0,0


In [0]:
for col in count:
    print(count[col].value_counts())

### One hot vectorize the preprocessed documents.

In [0]:
onehot.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6359 entries, 0 to 6358
Columns: 2621 entries, abc to zelensky
dtypes: int64(2621)
memory usage: 127.2 MB


In [0]:
oh_vect = CountVectorizer(binary = True)
oh_vects = oh_vect.fit_transform(no_punct)

onehot = pd.DataFrame(oh_vects.toarray(), columns = oh_vect.get_feature_names())

In [0]:
for col in onehot:
    print(onehot[col].value_counts())

### TF-IDF vectorize the preprocessed documents.

In [0]:
tf_idf = TfidfVectorizer()
tf_vects = tf_idf.fit_transform(docs_list)

tf_idf_df = pd.DataFrame(tf_vects.toarray(), columns= tf_idf.get_feature_names())

In [0]:
for col in tf_idf_df:
    print(tf_idf_df[col].value_counts())

In [0]:
count

Unnamed: 0,abc,abetting,ability,able,aborted,absolutely,abundantly,abuse,abused,accent,accept,accepted,accident,accomplices,accomplished,according,account,accused,accusers,accusing,acknowledged,across,act,acting,actions,actively,activists,activities,activity,actor,actual,actually,adam,add,added,adding,addition,address,administration,administrators,...,wonderfully,wood,woods,word,words,work,workers,working,world,worlds,worms,worried,worry,worse,worst,would,wounded,wow,wrapped,write,writes,writing,wrong,wrongdoing,wrongful,wrote,xinhua,ya,yawn,year,years,yell,yes,yesterday,yet,york,yorker,young,younger,zelensky
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6354,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6355,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6356,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6357,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [0]:
tf_idf_df

Unnamed: 0,abc,abet,abil,abl,abort,absolut,abund,abus,accent,accept,accid,accomplic,accomplish,accord,account,accus,acknowledg,across,act,action,activ,activist,actor,actual,ad,adam,add,addit,address,administr,admir,admiss,adopt,ador,advanc,advantag,advic,advis,aerial,affair,...,wind,window,wit,withheld,withhold,within,without,wolfson,woman,wonder,wood,word,work,worker,world,worm,worri,wors,worst,would,wound,wow,wrap,write,wrong,wrongdo,wrote,xinhua,ya,yawn,year,yell,yes,yesterday,yet,york,yorker,young,younger,zelenski
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.06161,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.052823,0.0,0.081176,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.170824,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.047633,0.047633,0.036682,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.047739,0.0,0.0,0.0,0.0,0.0,0.0,0.071609,0.0,0.0,0.0,0.0,0.0,0.0,0.073363,0.0,0.0,0.0,0.154384,0.0,0.0,0.0,0.0,0.055681,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.033744,0.0,0.0,0.0,0.0,0.0,0.028334,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.033744,0.033744,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.145432,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.074664,0.0,0.048477,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.037332,0.0,0.048477,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.032888,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.032888,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.037999,0.0,0.0,0.0,0.0,0.0,0.0,0.029263,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.02645,0.0,0.075997,0.0,0.0,0.0,0.0,0.019042,0.0,0.0,0.0,0.0,0.0,0.0,0.038084,0.0,0.0,0.0,0.0,0.022209,0.0,0.0,0.0,0.0,0.0,0.020526,0.0,0.0,0.0,0.0,0.044419,0.0,0.0,0.037999,0.0
4,0.022658,0.0,0.01961,0.0,0.0,0.0,0.0,0.031543,0.0,0.0,0.022658,0.0,0.0,0.013243,0.0,0.0,0.0,0.0,0.017448,0.0,0.0,0.0,0.0,0.015771,0.047314,0.031543,0.0,0.0,0.022658,0.031543,0.0,0.0,0.0,0.0,0.01961,0.0,0.0,0.034897,0.0,0.0,...,0.0,0.0,0.087242,0.067973,0.01961,0.015771,0.017448,0.0,0.0,0.0,0.0,0.0,0.056771,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.039221,0.0,0.0,0.0,0.01961,0.0,0.0,0.0,0.022658,0.0,0.0,0.017448,0.0,0.0,0.013243,0.0,0.0,0.0,0.137273
5,0.0,0.0,0.0,0.024179,0.0,0.0,0.0,0.0,0.0,0.024179,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.019445,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.027936,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.027936,0.0,0.048357,0.027998,0.0,0.064539,0.0,0.024179,0.0,0.0,0.027998,0.0,0.0,0.0,0.096715,0.016328,0.0,0.0,0.0,0.0,0.0,0.060362,0.027936,0.086052,0.0,0.0,0.016328,0.0,0.021513,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.021265,0.0,0.017102,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.042531,0.0,0.0,0.018921,0.0,0.018921,0.042531,0.0,0.0,0.0,0.017102,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.151367,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.042531,0.012312,0.0,0.0,0.0,0.021265,0.0,0.02457,0.012312,0.0,0.021265,0.049139,0.063796,0.01436,0.0,0.018921,0.0,0.0,0.0,0.026544,0.0,0.0,0.0,0.0,0.01436,0.02457,0.0,0.0,0.0
7,0.0,0.0,0.02656,0.02656,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.030687,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.023632,0.0,0.0,0.0,0.0,0.0,0.0,0.030687,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.030687,0.0,0.0,0.0,0.021361,0.0,0.0,0.0,0.0,0.02656,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.030687,0.0,0.0,0.0,0.023632,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.026509,0.0,0.039255,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.063141,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0245,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.080247,0.0,0.0,0.0,0.0,0.0,0.061798,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.080247,0.0,0.260091,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Use Doc2Vec to vectorize the preprocessed documents.

Set the size of the vectors to be the same size as those of the other methods using the `vector_size` argument.

In [0]:
doc_list_without_stop[1]

In [0]:
documents = [TaggedDocument(docs_list, [i]) for i, doc in enumerate(docs_list)]

In [0]:
model = Doc2Vec(documents, vector_size = 2621)


In [0]:
doc2vec = pd.DataFrame([[document]+ list(model[document]) for document in range(len(documents))]).drop(0, axis = 1)

In [0]:
documents[0][0][:5]

['cnn pink work pretti hard sound like take step back speak entertain tonight countri music associ award red carpet singer join husband carey hart kid willow jameson pink perform song love anyway countri star chris stapleton talk hectic thing two half year music willow back school jameson go start soon pink said kind year famili star also prais husband celebr year marriag januari carey lot go well said hart went profession motocross competitor race truck super support follow around world turn accord billboard pink beauti trauma tour rank tour time earn million cabl news network turner broadcast system right reserv listen cnn usag go full cnn experi',
 'cnn former massachusett gov deval patrick told friend alli phone call wednesday made decis run presid two peopl familiar matter say plan time formal announc still flux wednesday attent impeach sourc familiar plan tell cnn patrick offici file new hampshir primari thursday concord appear cbs morn odd could incred steep late entrant like pa

In [0]:
doc2vec

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,...,2582,2583,2584,2585,2586,2587,2588,2589,2590,2591,2592,2593,2594,2595,2596,2597,2598,2599,2600,2601,2602,2603,2604,2605,2606,2607,2608,2609,2610,2611,2612,2613,2614,2615,2616,2617,2618,2619,2620,2621
0,-0.000106,0.000134,9e-06,0.000186,1.9e-05,-2e-06,-4e-06,-9.6e-05,-5.9e-05,-0.00016,-8e-05,-7.3e-05,-8.4e-05,-0.000169,-2.1e-05,9.610671e-05,8.4e-05,0.000185,-0.00015,-0.000181,-5.9e-05,-0.000182,-2.5e-05,0.000172,8.4e-05,4.7e-05,-0.000115,8.1e-05,-8.8e-05,-9.8e-05,0.000144,-0.00018,-1.5e-05,0.00017,3.8e-05,8.4e-05,0.000159,-3.7e-05,0.000111,2.6e-05,...,5.321524e-05,-0.00016,-0.000134,-4.3e-05,9.8e-05,-0.000131,0.000178,-0.000123,1.5e-05,-9.1e-05,4e-05,-0.000187,-7.1e-05,-5.2e-05,-8.5e-05,-1.8e-05,-0.000127,1.5e-05,-0.0001568857,-0.0001195517,4.6e-05,9e-06,4.448841e-05,-0.000185,-0.000142,-7.1e-05,-1.9e-05,-0.000155,-0.000137,-5.1e-05,-1e-05,-2e-06,-0.000105,-0.000181,0.00018,-0.000142,-8.4e-05,0.000125,-0.000114,1.8e-05
1,2.2e-05,5.6e-05,0.000137,-0.000152,0.000189,-1.4e-05,-7e-05,-1.3e-05,-3.6e-05,0.000122,0.000134,7.3e-05,-4e-06,0.000176,4.1e-05,-0.0001543017,0.000182,-7.7e-05,1.9e-05,4.7e-05,-8.6e-05,0.000141,-0.000135,-3.5e-05,0.00017,2.3e-05,-6.5e-05,-9.2e-05,-0.000145,-3.2e-05,-9.2e-05,-1.1e-05,-4.9e-05,2.1e-05,-5.2e-05,2.8e-05,-1.2e-05,-0.000111,0.000155,-8.5e-05,...,-2.28282e-05,5.6e-05,-0.000121,3.5e-05,9.3e-05,-0.000182,-0.000181,6e-06,-0.000111,-0.000163,-0.000127,-0.000173,-0.000159,-0.000179,2.9e-05,-0.0001,7.4e-05,-0.000164,-2.743198e-05,-2.77535e-05,-4.2e-05,0.000157,1.233596e-05,0.000131,6e-06,-4.8e-05,8.6e-05,-0.000157,-0.000149,-0.000175,0.00014,-0.000135,-0.000106,-0.000179,-0.000157,0.000146,0.000121,0.000137,0.000125,-1.6e-05
2,-5.1e-05,-8.9e-05,-6.5e-05,0.000109,-9.6e-05,-0.000172,-0.000114,-4.4e-05,0.000104,7.2e-05,-9.1e-05,0.000112,-0.000183,0.000131,-6.3e-05,-0.0001815133,5.2e-05,0.000163,0.000178,0.00019,-2.4e-05,-8.7e-05,-8.3e-05,-0.00015,-0.000135,7.1e-05,-9.9e-05,0.000141,-0.000106,-5e-06,-1.5e-05,6.7e-05,0.000169,-2.6e-05,-3.4e-05,3.7e-05,-0.000168,0.000158,-5.2e-05,-8.6e-05,...,0.0001631233,-7.2e-05,7.8e-05,0.000184,-0.00011,0.000107,5.4e-05,0.000118,-1.4e-05,0.000117,-0.000146,-0.000107,0.000112,-0.000107,-0.000115,-0.000141,-0.000188,0.000168,0.0001264834,-0.0001490686,7.7e-05,-0.000167,0.0001516706,0.000151,-1.9e-05,-3.1e-05,4e-06,2.6e-05,0.000177,-0.000144,-0.000161,0.000136,8.8e-05,-0.000123,-7.9e-05,-0.000127,-8.6e-05,6e-05,0.000152,-8.6e-05
3,1.5e-05,-4e-06,-1.9e-05,0.000114,-9.1e-05,9.7e-05,-0.000184,0.000132,-0.000144,4.8e-05,-1.8e-05,0.000152,-2.7e-05,-9.2e-05,-0.000168,-0.0001373713,0.000106,4e-06,2.2e-05,5.5e-05,8.4e-05,7.7e-05,-6e-06,-5.6e-05,-0.0001,6.3e-05,-5.6e-05,-2.3e-05,-0.00018,0.000186,0.000101,0.000112,0.000173,0.000109,8.1e-05,-0.000133,0.000177,3.3e-05,-3.8e-05,3e-06,...,0.0001170928,6.2e-05,0.000174,0.000129,-0.000176,0.000103,0.000104,0.000165,-3.6e-05,-9.3e-05,0.000124,0.000118,0.000111,7.6e-05,3.2e-05,-0.000131,-5e-06,0.000157,5.13674e-05,-1.123828e-05,-5.1e-05,3e-05,-2.447929e-05,-7.3e-05,0.000183,0.000161,6.9e-05,4.3e-05,-9.8e-05,0.000105,-0.000119,-0.000173,-0.000134,7.8e-05,0.000182,9.2e-05,0.00013,-0.000171,0.000167,-0.000166
4,3.1e-05,6.8e-05,-0.000128,9.8e-05,-5.4e-05,0.000168,2.8e-05,-9.1e-05,-2.2e-05,-0.000162,0.000169,0.000185,0.000136,-0.000148,0.0001,9.092525e-05,0.000104,0.000178,0.000106,2e-06,0.000105,-6.1e-05,-0.000151,-0.000154,0.000125,-5.7e-05,-8.6e-05,-0.000144,0.000154,-3e-05,1.7e-05,0.000157,0.000159,0.000188,0.000107,0.000177,-0.000178,-0.000163,3e-05,0.000149,...,0.0001830793,0.000179,-0.000138,9.5e-05,-1.4e-05,-0.000146,1.8e-05,-0.000186,-1.7e-05,5.9e-05,-0.000159,0.000174,0.000164,7.7e-05,-0.000167,-0.000164,-7.8e-05,-0.000121,-0.0001231853,-2.331628e-05,-7.7e-05,8.8e-05,8.804194e-05,-0.00012,0.000145,1e-06,-3.7e-05,0.000122,5e-06,3.1e-05,8e-06,8.5e-05,-7.6e-05,1.4e-05,-0.000107,-8.8e-05,-4.7e-05,0.000143,-6.9e-05,0.000183
5,0.000168,-0.000117,-7.3e-05,0.000115,-0.000145,4e-06,-0.000113,-8e-06,0.000188,0.000157,0.000138,-0.000136,-9.9e-05,0.000142,9.4e-05,4.046728e-07,0.000116,-5.1e-05,-2.5e-05,-0.000111,-0.000135,-0.000145,-6.1e-05,-6.4e-05,-0.000162,7.1e-05,0.000167,0.000169,4.1e-05,-0.000168,5e-06,-1.8e-05,0.000138,6.9e-05,-0.000176,8.4e-05,2.8e-05,8e-06,0.00016,8.2e-05,...,-0.0001576892,-2.6e-05,-0.000144,-0.00017,-6.5e-05,-8.7e-05,0.000109,1.4e-05,-5.4e-05,-3.9e-05,-9e-06,7.3e-05,-0.000127,-3e-05,-7.4e-05,0.000181,-0.000131,0.00015,1.681652e-05,-0.0001423417,-0.000123,8.6e-05,8.733483e-05,-4.4e-05,-3.8e-05,9.2e-05,0.000137,3.1e-05,-8.6e-05,9.9e-05,-5e-06,6e-05,-5.3e-05,6.2e-05,-2.3e-05,5.5e-05,-0.000182,6.3e-05,8.8e-05,3.1e-05
6,0.000151,3.5e-05,7e-05,-1.4e-05,-6.8e-05,0.000103,7.7e-05,-8.5e-05,-6.6e-05,-7.2e-05,-8.2e-05,0.000148,-8.5e-05,7.5e-05,0.000142,0.000122771,-2.1e-05,-0.000175,-9.2e-05,0.000144,1.8e-05,-0.000149,0.000168,-0.000154,8.7e-05,0.000135,-0.000103,0.000133,7.9e-05,-9.1e-05,0.000151,0.000123,2.4e-05,2.1e-05,-0.000178,5.1e-05,-0.000158,0.000108,-0.00016,-7.4e-05,...,0.0001305053,0.000171,-0.000116,7.8e-05,5e-06,0.000126,-0.000122,7e-06,0.000177,0.000164,8.4e-05,4.9e-05,-5.8e-05,8e-05,0.000182,-0.000159,0.000123,4.6e-05,-0.0001424232,-2.429944e-05,0.000126,3.1e-05,-0.0001354906,-0.000139,-1.6e-05,-9e-05,-0.000135,-7.6e-05,0.000167,-2.6e-05,0.000102,-6.9e-05,-6.1e-05,-6.4e-05,0.000176,-7.8e-05,6.5e-05,0.000132,1.5e-05,-0.000166
7,0.000133,-9.7e-05,0.000136,0.000142,-0.00014,9.3e-05,-0.000183,-0.000157,7.7e-05,8.9e-05,-0.000102,0.000139,-0.00018,-0.00011,-0.000111,3.981434e-05,8.4e-05,0.000144,-0.000145,6.9e-05,-0.000144,-0.000144,5e-05,-0.00019,5e-06,9.4e-05,0.000124,-7.2e-05,0.000147,2.4e-05,0.000171,-0.000108,-0.000158,-0.000112,0.000169,0.000152,2.9e-05,0.000142,-8.3e-05,-0.000128,...,0.0001278561,6.2e-05,8.2e-05,5.7e-05,-4.9e-05,-2.5e-05,-0.000173,6e-06,0.000119,0.000112,3.8e-05,5.5e-05,-0.00014,0.000135,-0.000178,-0.000164,3.1e-05,-1.7e-05,-0.0001626841,-0.0001480244,-3.1e-05,0.000112,3.482201e-05,-0.000189,-4.5e-05,-0.000118,-0.000117,0.00013,8e-06,-2.5e-05,-0.000182,-0.000102,9.1e-05,9.9e-05,-0.000102,-0.0001,-4.6e-05,-4.1e-05,-0.000166,7.6e-05
8,-0.000158,-0.000162,-1.7e-05,0.000149,1.1e-05,-3.6e-05,4.1e-05,-6.5e-05,6e-05,-2.1e-05,0.000127,0.000133,0.000145,-0.000142,-0.00011,-4.030462e-05,-1.5e-05,-0.000127,0.00015,-0.00011,9.4e-05,-6.7e-05,0.000153,2.3e-05,4.2e-05,5.7e-05,-0.00017,-0.000163,0.000123,-0.000113,0.000108,-9.5e-05,0.000146,-8.6e-05,-7e-06,-8.5e-05,-3.8e-05,7e-06,6.6e-05,5.7e-05,...,-1.607985e-05,-4.2e-05,-0.00018,0.000122,9e-05,0.00017,0.000156,3.8e-05,-0.000147,-0.000173,-2.7e-05,-0.000111,0.000103,4.7e-05,-0.000136,0.000145,0.000132,0.000185,0.0001663082,5.399735e-05,-7.5e-05,-2.3e-05,-0.0001018649,-0.000155,0.000146,-0.000173,-5e-05,1e-06,8.5e-05,-0.000102,0.000187,-7.3e-05,0.000177,0.000109,-0.000173,7.9e-05,-0.000186,2.4e-05,-0.000133,-8.2e-05
9,-3.3e-05,0.000142,-0.000173,-0.000127,0.000181,-0.000116,8.4e-05,-0.000135,3.7e-05,0.000152,3.3e-05,0.000106,5e-06,-4.8e-05,-8.1e-05,0.0001071592,-9.9e-05,-6.7e-05,0.000171,2e-05,1.7e-05,-0.000129,7.7e-05,0.000186,0.000188,-0.00011,7.3e-05,-4.1e-05,8.4e-05,7.2e-05,-0.000153,0.000163,-1.3e-05,-0.00015,-5.3e-05,-3e-05,0.000157,-0.000133,-3.8e-05,0.000145,...,-0.0001068027,4.9e-05,3.3e-05,8.6e-05,-0.000135,4.1e-05,-9.1e-05,-0.000158,-0.000125,-0.000143,2e-06,-5e-05,7.8e-05,7.9e-05,0.000159,-0.00012,-0.000126,0.000106,0.000170625,-0.0001546233,-6.4e-05,-5e-05,0.0001564492,-0.000118,-9.9e-05,5.7e-05,-0.000159,-8e-06,0.000184,-0.000107,0.000114,6.4e-05,-4.1e-05,0.000123,-0.000136,-4.8e-05,0.000145,2.5e-05,-0.000158,5.3e-05
