# Text Vectorization and Feature Engineering Assignment

In [1]:
import pandas as pd
from nltk import sent_tokenize
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/abilenky/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### Read the CNN Lite plain text file articles into a corpus using the NLTK's PlaintextCorpusReader.

In [2]:
path = 'cnn_articles/'
doc_pattern = r'.*\.txt'
corpus = PlaintextCorpusReader(path, doc_pattern)

### Iterate through the fileids in the corpus, extract the raw text of each document, and store them in a list.

In [3]:
docs = [corpus.raw(id_) for id_ in corpus.fileids()]
docs[0]

'Pink taking a break to focus on familyUpdated 10:18 AM ET, Thu November 14, 2019(CNN) - Pink has been working pretty hard and it sounds like she will be taking a step back in 2020.Speaking with "Entertainment Tonight" on the Country Music Association Awards red carpet, the singer was joined by her husband, Carey Hart, and their kids Willow, 8, and Jameson, 2.Pink was there to perform her song "Love Me Anyway" with country star Chris Stapleton, and she  talked about how hectic things have been. "We did two and a half years of [music] and Willow\'s back in school now, Jameson\'s going to start pre-school soon," Pink said. "It\'s kind of the year of the family."The star also praised her husband, with whom she will celebrate 14 years of marriage in January."Carey has a lot going on as well," she said of Hart, who went from being a professional motocross competitor to racing off-road trucks. "He\'s super supportive, he follows me around the world and now it\'s his turn."According to Billbo

### Preprocess and clean the documents according to the steps below.

- Word Tokenize
- Lowercase
- Remove Stopwords
- Remove Punctuation
- Lemmatize
- Stem

In [4]:
# Word Tokenize
word_tokenized = [word_tokenize(doc) for doc in docs]
print(word_tokenized[0])

['Pink', 'taking', 'a', 'break', 'to', 'focus', 'on', 'familyUpdated', '10:18', 'AM', 'ET', ',', 'Thu', 'November', '14', ',', '2019', '(', 'CNN', ')', '-', 'Pink', 'has', 'been', 'working', 'pretty', 'hard', 'and', 'it', 'sounds', 'like', 'she', 'will', 'be', 'taking', 'a', 'step', 'back', 'in', '2020.Speaking', 'with', '``', 'Entertainment', 'Tonight', "''", 'on', 'the', 'Country', 'Music', 'Association', 'Awards', 'red', 'carpet', ',', 'the', 'singer', 'was', 'joined', 'by', 'her', 'husband', ',', 'Carey', 'Hart', ',', 'and', 'their', 'kids', 'Willow', ',', '8', ',', 'and', 'Jameson', ',', '2.Pink', 'was', 'there', 'to', 'perform', 'her', 'song', '``', 'Love', 'Me', 'Anyway', "''", 'with', 'country', 'star', 'Chris', 'Stapleton', ',', 'and', 'she', 'talked', 'about', 'how', 'hectic', 'things', 'have', 'been', '.', '``', 'We', 'did', 'two', 'and', 'a', 'half', 'years', 'of', '[', 'music', ']', 'and', 'Willow', "'s", 'back', 'in', 'school', 'now', ',', 'Jameson', "'s", 'going', 'to', 

In [5]:
# Lowercase and remove Stopwords
lowercased = [[token.lower() for token in doc if token.lower() not in stopwords.words('english')] for doc in word_tokenized]
print(lowercased[0])

['pink', 'taking', 'break', 'focus', 'familyupdated', '10:18', 'et', ',', 'thu', 'november', '14', ',', '2019', '(', 'cnn', ')', '-', 'pink', 'working', 'pretty', 'hard', 'sounds', 'like', 'taking', 'step', 'back', '2020.speaking', '``', 'entertainment', 'tonight', "''", 'country', 'music', 'association', 'awards', 'red', 'carpet', ',', 'singer', 'joined', 'husband', ',', 'carey', 'hart', ',', 'kids', 'willow', ',', '8', ',', 'jameson', ',', '2.pink', 'perform', 'song', '``', 'love', 'anyway', "''", 'country', 'star', 'chris', 'stapleton', ',', 'talked', 'hectic', 'things', '.', '``', 'two', 'half', 'years', '[', 'music', ']', 'willow', "'s", 'back', 'school', ',', 'jameson', "'s", 'going', 'start', 'pre-school', 'soon', ',', "''", 'pink', 'said', '.', '``', "'s", 'kind', 'year', 'family', '.', '``', 'star', 'also', 'praised', 'husband', ',', 'celebrate', '14', 'years', 'marriage', 'january', '.', '``', 'carey', 'lot', 'going', 'well', ',', "''", 'said', 'hart', ',', 'went', 'professio

In [6]:
# Remove Punctuation
no_punct = [[token for token in doc if token.isalpha()] for doc in lowercased]
print(no_punct[0])

['pink', 'taking', 'break', 'focus', 'familyupdated', 'et', 'thu', 'november', 'cnn', 'pink', 'working', 'pretty', 'hard', 'sounds', 'like', 'taking', 'step', 'back', 'entertainment', 'tonight', 'country', 'music', 'association', 'awards', 'red', 'carpet', 'singer', 'joined', 'husband', 'carey', 'hart', 'kids', 'willow', 'jameson', 'perform', 'song', 'love', 'anyway', 'country', 'star', 'chris', 'stapleton', 'talked', 'hectic', 'things', 'two', 'half', 'years', 'music', 'willow', 'back', 'school', 'jameson', 'going', 'start', 'soon', 'pink', 'said', 'kind', 'year', 'family', 'star', 'also', 'praised', 'husband', 'celebrate', 'years', 'marriage', 'january', 'carey', 'lot', 'going', 'well', 'said', 'hart', 'went', 'professional', 'motocross', 'competitor', 'racing', 'trucks', 'super', 'supportive', 'follows', 'around', 'world', 'turn', 'according', 'billboard', 'pink', 'beautiful', 'trauma', 'tour', 'ranks', 'tour', 'time', 'earning', 'million']


In [7]:
# Lemmatize
lemmatizer = WordNetLemmatizer()

lemmatized = [[lemmatizer.lemmatize(token) for token in doc] for doc in no_punct]
print(lemmatized[0])

['pink', 'taking', 'break', 'focus', 'familyupdated', 'et', 'thu', 'november', 'cnn', 'pink', 'working', 'pretty', 'hard', 'sound', 'like', 'taking', 'step', 'back', 'entertainment', 'tonight', 'country', 'music', 'association', 'award', 'red', 'carpet', 'singer', 'joined', 'husband', 'carey', 'hart', 'kid', 'willow', 'jameson', 'perform', 'song', 'love', 'anyway', 'country', 'star', 'chris', 'stapleton', 'talked', 'hectic', 'thing', 'two', 'half', 'year', 'music', 'willow', 'back', 'school', 'jameson', 'going', 'start', 'soon', 'pink', 'said', 'kind', 'year', 'family', 'star', 'also', 'praised', 'husband', 'celebrate', 'year', 'marriage', 'january', 'carey', 'lot', 'going', 'well', 'said', 'hart', 'went', 'professional', 'motocross', 'competitor', 'racing', 'truck', 'super', 'supportive', 'follows', 'around', 'world', 'turn', 'according', 'billboard', 'pink', 'beautiful', 'trauma', 'tour', 'rank', 'tour', 'time', 'earning', 'million']


In [8]:
# Stem
stemmer = SnowballStemmer('english')
stemmed = [[stemmer.stem(token) for token in doc] for doc in lemmatized]
print(stemmed[0])

['pink', 'take', 'break', 'focus', 'familyupd', 'et', 'thu', 'novemb', 'cnn', 'pink', 'work', 'pretti', 'hard', 'sound', 'like', 'take', 'step', 'back', 'entertain', 'tonight', 'countri', 'music', 'associ', 'award', 'red', 'carpet', 'singer', 'join', 'husband', 'carey', 'hart', 'kid', 'willow', 'jameson', 'perform', 'song', 'love', 'anyway', 'countri', 'star', 'chris', 'stapleton', 'talk', 'hectic', 'thing', 'two', 'half', 'year', 'music', 'willow', 'back', 'school', 'jameson', 'go', 'start', 'soon', 'pink', 'said', 'kind', 'year', 'famili', 'star', 'also', 'prais', 'husband', 'celebr', 'year', 'marriag', 'januari', 'carey', 'lot', 'go', 'well', 'said', 'hart', 'went', 'profession', 'motocross', 'competitor', 'race', 'truck', 'super', 'support', 'follow', 'around', 'world', 'turn', 'accord', 'billboard', 'pink', 'beauti', 'trauma', 'tour', 'rank', 'tour', 'time', 'earn', 'million']


### Count vectorize the preprocessed documents.

In [9]:
processed_docs = [' '.join(word for word in doc) for doc in stemmed]
processed_docs[0]

'pink take break focus familyupd et thu novemb cnn pink work pretti hard sound like take step back entertain tonight countri music associ award red carpet singer join husband carey hart kid willow jameson perform song love anyway countri star chris stapleton talk hectic thing two half year music willow back school jameson go start soon pink said kind year famili star also prais husband celebr year marriag januari carey lot go well said hart went profession motocross competitor race truck super support follow around world turn accord billboard pink beauti trauma tour rank tour time earn million'

In [10]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
vectors = vectorizer.fit_transform(processed_docs)
count_vectorized = pd.DataFrame(vectors.toarray() , columns=vectorizer.get_feature_names())
count_vectorized

Unnamed: 0,abc,abet,abil,abl,abort,absolut,abund,abus,accent,accept,...,year,yell,yes,yesterday,yet,york,yorker,young,younger,zelenski
0,0,0,0,0,0,0,0,0,0,0,...,3,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,5,0,0,0,0,2,0,0,0,0
2,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
3,0,0,0,0,0,1,0,0,0,0,...,1,0,0,0,0,2,0,0,1,0
4,1,0,1,0,0,0,0,2,0,0,...,0,0,1,0,0,1,0,0,0,7
5,0,0,0,1,0,0,0,0,0,1,...,3,1,4,0,0,1,0,1,0,0
6,0,0,0,0,0,1,0,1,0,0,...,2,0,0,0,0,1,1,0,0,0
7,0,0,1,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0
8,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,5,0,0,0,0,0,0,0,0,0


### One hot vectorize the preprocessed documents.

In [11]:
vectorizer = CountVectorizer(binary=True)
vectors = vectorizer.fit_transform(processed_docs)
one_hot = pd.DataFrame(vectors.toarray() , columns=vectorizer.get_feature_names())
one_hot

Unnamed: 0,abc,abet,abil,abl,abort,absolut,abund,abus,accent,accept,...,year,yell,yes,yesterday,yet,york,yorker,young,younger,zelenski
0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,0
2,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
3,0,0,0,0,0,1,0,0,0,0,...,1,0,0,0,0,1,0,0,1,0
4,1,0,1,0,0,0,0,1,0,0,...,0,0,1,0,0,1,0,0,0,1
5,0,0,0,1,0,0,0,0,0,1,...,1,1,1,0,0,1,0,1,0,0
6,0,0,0,0,0,1,0,1,0,0,...,1,0,0,0,0,1,1,0,0,0
7,0,0,1,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0
8,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


### TF-IDF vectorize the preprocessed documents.

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(processed_docs)
tfidf = pd.DataFrame(vectors.toarray() , columns=vectorizer.get_feature_names())
tfidf

Unnamed: 0,abc,abet,abil,abl,abort,absolut,abund,abus,accent,accept,...,year,yell,yes,yesterday,yet,york,yorker,young,younger,zelenski
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.170499,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.136661,0.0,0.0,0.0,0.0,0.059146,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.03384,0.0,0.0,...,0.0,0.0,0.0,0.0,0.048615,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.033159,0.0,0.0,0.0,0.0,...,0.020695,0.0,0.0,0.0,0.0,0.044785,0.0,0.0,0.038312,0.0
4,0.023741,0.0,0.020548,0.0,0.0,0.0,0.0,0.033051,0.0,0.0,...,0.0,0.0,0.020548,0.0,0.0,0.013876,0.0,0.0,0.0,0.143838
5,0.0,0.0,0.0,0.023933,0.0,0.0,0.0,0.0,0.0,0.023933,...,0.04481,0.027651,0.09573,0.0,0.0,0.016162,0.0,0.021294,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.020955,0.0,0.016853,0.0,0.0,...,0.026158,0.0,0.0,0.0,0.0,0.014151,0.024212,0.0,0.0,0.0
7,0.0,0.0,0.026857,0.026857,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.03103,0.0,0.0,0.0,0.023896,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.024116,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.194864,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
tfidf.shape[1]

2068

### Use Doc2Vec to vectorize the preprocessed documents.

Set the size of the vectors to be the same size as those of the other methods using the `vector_size` argument.

In [14]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(processed_docs)]
documents[0]

TaggedDocument(words='pink take break focus familyupd et thu novemb cnn pink work pretti hard sound like take step back entertain tonight countri music associ award red carpet singer join husband carey hart kid willow jameson perform song love anyway countri star chris stapleton talk hectic thing two half year music willow back school jameson go start soon pink said kind year famili star also prais husband celebr year marriag januari carey lot go well said hart went profession motocross competitor race truck super support follow around world turn accord billboard pink beauti trauma tour rank tour time earn million', tags=[0])

In [26]:
model = Doc2Vec(documents, vector_size=tfidf.shape[1])
vectors = [model[i] for i in range(len(docs))]
doc2vec = pd.DataFrame(vectors)
doc2vec

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2058,2059,2060,2061,2062,2063,2064,2065,2066,2067
0,-0.00603,-0.002139,0.010496,0.004134,-0.004011,0.003866,-0.008996,-0.012206,0.017337,-0.003323,...,-0.01069,-0.005465,-0.004358,-0.012839,0.006479,0.010248,-0.007246,-0.001741,0.026621,0.004002
1,-0.014624,-0.005247,0.025153,0.009444,-0.009909,0.009191,-0.021241,-0.029366,0.041574,-0.008319,...,-0.025784,-0.012903,-0.010905,-0.030099,0.015216,0.024107,-0.016978,-0.003683,0.063049,0.009053
2,-0.015242,-0.005612,0.026943,0.010218,-0.010476,0.009615,-0.022662,-0.031465,0.044346,-0.008591,...,-0.027412,-0.013768,-0.011678,-0.031568,0.016079,0.025544,-0.018013,-0.004013,0.066643,0.009616
3,-0.014564,-0.005318,0.025138,0.009458,-0.009792,0.009276,-0.021441,-0.029555,0.042374,-0.008033,...,-0.025434,-0.01303,-0.010658,-0.029607,0.015116,0.023469,-0.016377,-0.003963,0.062003,0.009033
4,-0.014562,-0.005458,0.02536,0.009902,-0.00969,0.009071,-0.021646,-0.029479,0.04248,-0.008469,...,-0.025953,-0.013221,-0.011149,-0.030302,0.01563,0.0247,-0.017174,-0.003722,0.063883,0.009753
5,-0.015485,-0.005543,0.02663,0.010198,-0.010288,0.00976,-0.022379,-0.030788,0.043752,-0.008679,...,-0.025763,-0.013015,-0.010981,-0.029805,0.015375,0.024101,-0.01674,-0.003595,0.06302,0.009443
6,-0.015451,-0.005794,0.02681,0.010369,-0.010637,0.009748,-0.022801,-0.031499,0.044871,-0.008663,...,-0.027598,-0.014343,-0.011763,-0.032359,0.016446,0.026044,-0.018113,-0.004168,0.067715,0.00979
7,-0.013391,-0.00514,0.023272,0.009129,-0.009318,0.008548,-0.019808,-0.027514,0.038635,-0.007738,...,-0.024457,-0.012637,-0.010498,-0.028417,0.014624,0.022753,-0.01625,-0.003561,0.060137,0.008529
8,-0.010676,-0.003903,0.018454,0.007337,-0.007259,0.006564,-0.015989,-0.021714,0.0314,-0.006003,...,-0.019162,-0.009812,-0.008204,-0.022141,0.011395,0.018126,-0.01271,-0.002649,0.046959,0.006903
9,-0.009826,-0.003465,0.017017,0.006308,-0.006433,0.006044,-0.014315,-0.020062,0.028451,-0.005454,...,-0.016991,-0.00873,-0.00741,-0.019699,0.010089,0.015956,-0.011224,-0.002386,0.041984,0.005927
