In [1]:
import pandas as pd
import numpy as np
import re
import string
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hamza\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## I- Text preprocessing

In [2]:
text = "If I were to ask you what's the different        between \"Word\" and \"word\" \
in terms of understanding _-_-_-_the word, chances are you'd say they're-- the same thing--. To a machine that doesn't *understand.. words, they aren't the same.  One way to circumvent this is to simply ensure that all text is fed to the transformation steps as lowercase text only. This step helps eliminate any redundancy in words. To achieve this, this sample code is more than enough"
print(text)

If I were to ask you what's the different        between "Word" and "word" in terms of understanding _-_-_-_the word, chances are you'd say they're-- the same thing--. To a machine that doesn't *understand.. words, they aren't the same.  One way to circumvent this is to simply ensure that all text is fed to the transformation steps as lowercase text only. This step helps eliminate any redundancy in words. To achieve this, this sample code is more than enough


### 1- Text lowercasing

In [3]:
print(text.lower())

if i were to ask you what's the different        between "word" and "word" in terms of understanding _-_-_-_the word, chances are you'd say they're-- the same thing--. to a machine that doesn't *understand.. words, they aren't the same.  one way to circumvent this is to simply ensure that all text is fed to the transformation steps as lowercase text only. this step helps eliminate any redundancy in words. to achieve this, this sample code is more than enough


### 2- Removing punctuation (Regex matching)

In [4]:
print(string.punctuation)
no_punct_text= re.sub(r'['+string.punctuation+']', '', text.lower())
print(no_punct_text)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
if i were to ask you whats the different        between word and word in terms of understanding the word chances are youd say theyre the same thing to a machine that doesnt understand words they arent the same  one way to circumvent this is to simply ensure that all text is fed to the transformation steps as lowercase text only this step helps eliminate any redundancy in words to achieve this this sample code is more than enough


### 3- Removing extra spaces 

In [5]:
no_extra_space = re.sub(' +',' ',no_punct_text)
print(no_extra_space)

if i were to ask you whats the different between word and word in terms of understanding the word chances are youd say theyre the same thing to a machine that doesnt understand words they arent the same one way to circumvent this is to simply ensure that all text is fed to the transformation steps as lowercase text only this step helps eliminate any redundancy in words to achieve this this sample code is more than enough


### 4- Removing Stop words

In [6]:
stopword = stopwords.words('english')
print(stopword)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [7]:
new_text = " ".join([word for word in str(no_extra_space.lower()).split() if word not in stopword])
print(new_text)

ask whats different word word terms understanding word chances youd say theyre thing machine doesnt understand words arent one way circumvent simply ensure text fed transformation steps lowercase text step helps eliminate redundancy words achieve sample code enough


In [9]:
def final_clean(row):
    row = " ".join([word for word in str(row.lower()).split() if word not in stopword])
    row = re.sub(r'['+string.punctuation+']', '', row)
    no_extra_space = re.sub(' +',' ',row)
    return row

In [12]:
final_text = final_clean(text)
print(final_text)

ask whats different word word terms understanding the word chances say theyre thing machine understand words same one way circumvent simply ensure text fed transformation steps lowercase text only step helps eliminate redundancy words achieve this sample code enough


## II-Text transformation

In [14]:
text_df = pd.read_csv('inaugural_speeches.csv')

text_df['clean_text'] = text_df['text'].apply(lambda row: final_clean(row))
text_df.head()

Unnamed: 0,Name,Inaugural Address,Date,text,clean_text
0,George Washington,First Inaugural Address,"Thursday, April 30, 1789",Fellow-Citizens of the Senate and of the House...,fellowcitizens senate house representatives am...
1,George Washington,Second Inaugural Address,"Monday, March 4, 1793",Fellow Citizens: I AM again called upon by th...,fellow citizens called upon voice country exec...
2,John Adams,Inaugural Address,"Saturday, March 4, 1797","WHEN it was first perceived, in early times, t...",first perceived early times middle course amer...
3,Thomas Jefferson,First Inaugural Address,"Wednesday, March 4, 1801",Friends and Fellow-Citizens: CALLED upon to u...,friends fellowcitizens called upon undertake d...
4,Thomas Jefferson,Second Inaugural Address,"Monday, March 4, 1805","PROCEEDING, fellow-citizens, to that qualifica...",proceeding fellowcitizens qualification consti...


### 1- Word Counting

In [37]:
# Import CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Instantiate CountVectorizer
cv = CountVectorizer(min_df=0.1,max_df=0.9,stop_words='english',ngram_range=(1,3)) #Instantiate the Count Vectorizer

# Fit the vectorizer
cv.fit(text_df['clean_text'])
cv_transformed = cv.transform(text_df['clean_text']) #Transform the data into an array of word counts array
cv_array = cv_transformed.toarray() #Create numpy array of the values
# Print feature names
print(cv.get_feature_names()) #Get the list of the counted(relevant) words


['abandon', 'abiding', 'ability', 'able', 'abroad', 'absolute', 'abuse', 'abuses', 'accept', 'accepted', 'accomplish', 'accomplished', 'accomplishment', 'accordance', 'according', 'account', 'accountability', 'achieve', 'achieved', 'achievement', 'achievements', 'acknowledge', 'acknowledged', 'acquiescence', 'act', 'acted', 'acting', 'action', 'actions', 'acts', 'actual', 'adapted', 'add', 'added', 'additional', 'address', 'adequate', 'adjust', 'adjustment', 'administer', 'administered', 'administration', 'administration government', 'administrations', 'admitted', 'adopt', 'adopted', 'adoption', 'advance', 'advanced', 'advancement', 'advancing', 'advantage', 'advantages', 'affairs', 'affect', 'affecting', 'affection', 'afford', 'afforded', 'age', 'agencies', 'agents', 'ages', 'aggression', 'ago', 'agree', 'agricultural', 'agriculture', 'agriculture commerce', 'aid', 'aids', 'aim', 'alike', 'alliances', 'allies', 'allow', 'allowed', 'almighty', 'almighty god', 'altogether', 'ambition', 

In [38]:
print(cv_array)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 1 0]
 ...
 [0 0 1 ... 1 0 0]
 [0 0 0 ... 1 0 0]
 [0 0 0 ... 1 0 0]]


In [39]:
cv_df = pd.DataFrame(cv_array, 
                     columns=cv.get_feature_names()).add_prefix('Word_c_')
cv_df.head()

Unnamed: 0,Word_c_abandon,Word_c_abiding,Word_c_ability,Word_c_able,Word_c_abroad,Word_c_absolute,Word_c_abuse,Word_c_abuses,Word_c_accept,Word_c_accepted,...,Word_c_written,Word_c_wrong,Word_c_year,Word_c_years,Word_c_years ago,Word_c_yes,Word_c_yield,Word_c_young,Word_c_zeal,Word_c_zealously
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,1,0,0,1,0,0,...,0,0,2,3,0,0,0,0,1,0
3,1,0,0,0,1,1,0,2,0,0,...,0,2,0,0,0,0,0,0,1,0
4,0,0,0,1,0,0,0,2,0,0,...,1,0,2,2,0,0,0,0,3,0


### 2- TF-IDF Vectorizer

In [42]:
# Import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Instantiate TfidfVectorizer
tv = TfidfVectorizer(stop_words='english',max_df=0.8,min_df=0.1,ngram_range=(1,3))
#min_df : the minimum frequency of words in dataset
#max_df : the maximum frequency of words in dataset
#stop_words: words to ignore (defined above in post)
#ngram_range: explained above
# Fit the vectroizer and transform the data
tv_transformed = tv.fit_transform(text_df['clean_text'])

# Create a DataFrame with these features
tv_df = pd.DataFrame(tv_transformed.toarray(), 
                     columns=tv.get_feature_names()).add_prefix('TFIDF_')
tv_df.head()



Unnamed: 0,TFIDF_abandon,TFIDF_abiding,TFIDF_ability,TFIDF_able,TFIDF_abroad,TFIDF_absolute,TFIDF_abuse,TFIDF_abuses,TFIDF_accept,TFIDF_accepted,...,TFIDF_written,TFIDF_wrong,TFIDF_year,TFIDF_years,TFIDF_years ago,TFIDF_yes,TFIDF_yield,TFIDF_young,TFIDF_zeal,TFIDF_zealously
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.027965,0.0,0.0,0.036379,0.0,0.0,...,0.0,0.0,0.05092,0.049983,0.0,0.0,0.0,0.0,0.037761,0.0
3,0.048435,0.0,0.0,0.0,0.034461,0.046532,0.0,0.08966,0.0,0.0,...,0.0,0.065688,0.0,0.0,0.0,0.0,0.0,0.0,0.046532,0.0
4,0.0,0.0,0.0,0.030796,0.0,0.0,0.0,0.072131,0.0,0.0,...,0.034827,0.0,0.050481,0.033035,0.0,0.0,0.0,0.0,0.112304,0.0
