In [1]:
# Load necessary libraries
import pandas as pd
import numpy as np
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk import pos_tag, pos_tag_sents
from nltk import word_tokenize

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/vinaynagaraj/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/vinaynagaraj/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/vinaynagaraj/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [2]:
# Read the controversial-comments.jsonl file into a DataFrame
cntr_cmts = pd.read_json(r'controversial-comments.jsonl', lines=True)

In [3]:
cntr_cmts.head()

Unnamed: 0,con,txt
0,0,Well it's great that he did something about th...
1,0,You are right Mr. President.
2,0,You have given no input apart from saying I am...
3,0,I get the frustration but the reason they want...
4,0,I am far from an expert on TPP and I would ten...


In [4]:
cntr_cmts_df = cntr_cmts.sample(n=50000)

In [5]:
# 1.A Convert all text to lowercase letters.
cntr_cmts_df['txt'] = cntr_cmts_df['txt'].str.lower()
cntr_cmts_df

Unnamed: 0,con,txt
763661,0,"nah, plenty of space seeing how big and empty ..."
61114,0,"lol. actually, i'm an independent voter. but..."
704882,0,"ah, so you're perpetuating the cycle. at least..."
246545,0,they started a gofundme on the_d and despite o...
633702,0,can't argue that the courts can change their m...
...,...,...
919582,0,just have to last 4 more years until president...
297104,0,agreed hence i'm more concerned about the sena...
278394,0,the msnbc crew is sad...
889109,0,maybe intel briefings should be in the form of...


In [6]:
# 1.B Remove all punctuation from the text.
cntr_cmts_df['txt'] = cntr_cmts_df['txt'].str.replace('[^\w\s]','')
cntr_cmts_df

Unnamed: 0,con,txt
763661,0,nah plenty of space seeing how big and empty i...
61114,0,lol actually im an independent voter but you...
704882,0,ah so youre perpetuating the cycle at least a ...
246545,0,they started a gofundme on the_d and despite o...
633702,0,cant argue that the courts can change their mi...
...,...,...
919582,0,just have to last 4 more years until president...
297104,0,agreed hence im more concerned about the senat...
278394,0,the msnbc crew is sad
889109,0,maybe intel briefings should be in the form of...


In [7]:
# 1.C Remove stop words.
stop = stopwords.words('english')

In [8]:
cntr_cmts_df['txt'] = cntr_cmts_df['txt'].apply(lambda x: ' '.join([item for item in x.split() if item not in stop]))
cntr_cmts_df

Unnamed: 0,con,txt
763661,0,nah plenty space seeing big empty
61114,0,lol actually im independent voter know come li...
704882,0,ah youre perpetuating cycle least burger flipp...
246545,0,started gofundme the_d despite 5k upvotes 83 p...
633702,0,cant argue courts change mind could thing roe ...
...,...,...
919582,0,last 4 years president clinton takes
297104,0,agreed hence im concerned senate congress pres...
278394,0,msnbc crew sad
889109,0,maybe intel briefings form tv show


In [9]:
# 1.D. Apply NLTK’s PorterStemmer
porter_stemmer = PorterStemmer()
def stem_sentences(sentence):
    tokens = sentence.split()
    stemmed_tokens = [porter_stemmer.stem(token) for token in tokens]
    return ' '.join(stemmed_tokens)
cntr_cmts_df['txt_stem'] = cntr_cmts_df['txt'].apply(stem_sentences)
cntr_cmts_df

Unnamed: 0,con,txt,txt_stem
763661,0,nah plenty space seeing big empty,nah plenti space see big empti
61114,0,lol actually im independent voter know come li...,lol actual im independ voter know come littl n...
704882,0,ah youre perpetuating cycle least burger flipp...,ah your perpetu cycl least burger flipper some...
246545,0,started gofundme the_d despite 5k upvotes 83 p...,start gofundm the_d despit 5k upvot 83 peopl c...
633702,0,cant argue courts change mind could thing roe ...,cant argu court chang mind could thing roe v w...
...,...,...,...
919582,0,last 4 years president clinton takes,last 4 year presid clinton take
297104,0,agreed hence im concerned senate congress pres...,agre henc im concern senat congress presid unf...
278394,0,msnbc crew sad,msnbc crew sad
889109,0,maybe intel briefings form tv show,mayb intel brief form tv show


**2. Now that the data is pre-processed, you will apply three different techniques to get it into a usable form for model-building. Apply each of the following steps (individually) to the pre-processed data.**

In [10]:
# A. Convert each text entry into a word-count vector (see sections 5.3 & 6.8 in the Machine Learning with Python Cookbook).
cv = CountVectorizer()
cv.fit(cntr_cmts_df['txt_stem'])

word_countv = cv.transform(cntr_cmts_df['txt_stem'])

In [11]:
# Load word count vector to datframe
features = cv.get_feature_names()
cntr_cmts_df_wc = pd.DataFrame(word_countv.toarray(), columns=features)
cntr_cmts_df_wc

Unnamed: 0,00,000,00000005664599088,00000007,000001,00001,000025,00003,00004,000053,...,юccp,яepublican,яich,яs,яthe_donald,الصحاف,سعيد,محمد,ᵗʰᵉʳᵉ,なるほど
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
49996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
49997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
49998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# B. Convert each text entry into a part-of-speech tag vector (see section 6.7 in the Machine Learning with Python Cookbook).
text_entry = cntr_cmts_df['txt_stem'].tolist()
text_entry

['nah plenti space see big empti',
 'lol actual im independ voter know come littl narr nah ive gotten death threat laugh peopl get offend look away easi',
 'ah your perpetu cycl least burger flipper someth use',
 'start gofundm the_d despit 5k upvot 83 peopl contribut embarrass show upvot bot user shut forward donat page church start someon els let find read edit httpswwwredditcomrenoughtrumpspamcomments5avnmethe_donald_hosted_a_fundraiser_and_then_takes_it',
 'cant argu court chang mind could thing roe v wade im gonna spout abort illeg case state current definit highest court rule interpret second everyon seem believ militia staterun conflict mani right grant god seen bill right 1 act selfdefens person famili innoc nation 2 carri weapon selfdefens ensur nation remain free 7 associ disassoci person group 16 abolish said govern becom destruct right histor nra absolut 0 interpret wellregul wellregul 1776 even mean someth proper work condit militia also defin legal 10 us code 311a read mi

In [13]:
map(word_tokenize, text_entry)

<map at 0x1a1c8785d0>

In [14]:
cntr_cmts_df['txt_stem'].apply(word_tokenize)

763661                [nah, plenti, space, see, big, empti]
61114     [lol, actual, im, independ, voter, know, come,...
704882    [ah, your, perpetu, cycl, least, burger, flipp...
246545    [start, gofundm, the_d, despit, 5k, upvot, 83,...
633702    [cant, argu, court, chang, mind, could, thing,...
                                ...                        
919582               [last, 4, year, presid, clinton, take]
297104    [agre, henc, im, concern, senat, congress, pre...
278394                                   [msnbc, crew, sad]
889109                 [mayb, intel, brief, form, tv, show]
824017    [pretti, much, insult, anyon, doesnt, agre, wi...
Name: txt_stem, Length: 50000, dtype: object

In [15]:
cntr_cmts_df['txt_stem'].apply(word_tokenize).tolist()

[['nah', 'plenti', 'space', 'see', 'big', 'empti'],
 ['lol',
  'actual',
  'im',
  'independ',
  'voter',
  'know',
  'come',
  'littl',
  'narr',
  'nah',
  'ive',
  'gotten',
  'death',
  'threat',
  'laugh',
  'peopl',
  'get',
  'offend',
  'look',
  'away',
  'easi'],
 ['ah',
  'your',
  'perpetu',
  'cycl',
  'least',
  'burger',
  'flipper',
  'someth',
  'use'],
 ['start',
  'gofundm',
  'the_d',
  'despit',
  '5k',
  'upvot',
  '83',
  'peopl',
  'contribut',
  'embarrass',
  'show',
  'upvot',
  'bot',
  'user',
  'shut',
  'forward',
  'donat',
  'page',
  'church',
  'start',
  'someon',
  'els',
  'let',
  'find',
  'read',
  'edit',
  'httpswwwredditcomrenoughtrumpspamcomments5avnmethe_donald_hosted_a_fundraiser_and_then_takes_it'],
 ['cant',
  'argu',
  'court',
  'chang',
  'mind',
  'could',
  'thing',
  'roe',
  'v',
  'wade',
  'im',
  'gon',
  'na',
  'spout',
  'abort',
  'illeg',
  'case',
  'state',
  'current',
  'definit',
  'highest',
  'court',
  'rule',
  'i

In [16]:
pos_tag_sents( cntr_cmts_df['txt_stem'].apply(word_tokenize).tolist() )

[[('nah', 'RB'),
  ('plenti', 'JJ'),
  ('space', 'NN'),
  ('see', 'VBP'),
  ('big', 'JJ'),
  ('empti', 'NN')],
 [('lol', 'JJ'),
  ('actual', 'JJ'),
  ('im', 'NN'),
  ('independ', 'VBP'),
  ('voter', 'NN'),
  ('know', 'VBP'),
  ('come', 'VBN'),
  ('littl', 'JJ'),
  ('narr', 'JJ'),
  ('nah', 'JJ'),
  ('ive', 'JJ'),
  ('gotten', 'VBN'),
  ('death', 'NN'),
  ('threat', 'NN'),
  ('laugh', 'IN'),
  ('peopl', 'NN'),
  ('get', 'VBP'),
  ('offend', 'JJ'),
  ('look', 'VB'),
  ('away', 'RP'),
  ('easi', 'NN')],
 [('ah', 'RB'),
  ('your', 'PRP$'),
  ('perpetu', 'NN'),
  ('cycl', 'NN'),
  ('least', 'JJS'),
  ('burger', 'JJ'),
  ('flipper', 'NN'),
  ('someth', 'NN'),
  ('use', 'NN')],
 [('start', 'NN'),
  ('gofundm', 'NN'),
  ('the_d', 'NN'),
  ('despit', 'NN'),
  ('5k', 'CD'),
  ('upvot', 'JJ'),
  ('83', 'CD'),
  ('peopl', 'NN'),
  ('contribut', 'NN'),
  ('embarrass', 'NN'),
  ('show', 'NN'),
  ('upvot', 'JJ'),
  ('bot', 'NN'),
  ('user', 'NN'),
  ('shut', 'VBD'),
  ('forward', 'RB'),
  ('donat', '

In [17]:
cntr_cmts_df['pos'] = pos_tag_sents( cntr_cmts_df['txt_stem'].apply(word_tokenize).tolist() )
cntr_cmts_df

Unnamed: 0,con,txt,txt_stem,pos
763661,0,nah plenty space seeing big empty,nah plenti space see big empti,"[(nah, RB), (plenti, JJ), (space, NN), (see, V..."
61114,0,lol actually im independent voter know come li...,lol actual im independ voter know come littl n...,"[(lol, JJ), (actual, JJ), (im, NN), (independ,..."
704882,0,ah youre perpetuating cycle least burger flipp...,ah your perpetu cycl least burger flipper some...,"[(ah, RB), (your, PRP$), (perpetu, NN), (cycl,..."
246545,0,started gofundme the_d despite 5k upvotes 83 p...,start gofundm the_d despit 5k upvot 83 peopl c...,"[(start, NN), (gofundm, NN), (the_d, NN), (des..."
633702,0,cant argue courts change mind could thing roe ...,cant argu court chang mind could thing roe v w...,"[(cant, JJ), (argu, JJ), (court, NN), (chang, ..."
...,...,...,...,...
919582,0,last 4 years president clinton takes,last 4 year presid clinton take,"[(last, JJ), (4, CD), (year, NN), (presid, NN)..."
297104,0,agreed hence im concerned senate congress pres...,agre henc im concern senat congress presid unf...,"[(agre, NN), (henc, NN), (im, NN), (concern, N..."
278394,0,msnbc crew sad,msnbc crew sad,"[(msnbc, NN), (crew, VBD), (sad, JJ)]"
889109,0,maybe intel briefings form tv show,mayb intel brief form tv show,"[(mayb, JJ), (intel, NN), (brief, JJ), (form, ..."


In [18]:
# C. Convert each entry into a term frequency-inverse document frequency (tfidf) vector (see section 6.9 in the Machine Learning with Python Cookbook).
tfidf = TfidfVectorizer()

In [19]:
# Due to memory issue, let's sort 10000 random records from dataframe
cntr_cmts_df1 = cntr_cmts_df.sample(n=10000)

In [20]:
text_entry2 = cntr_cmts_df1['txt_stem'].tolist()
termfreq_matrix = tfidf.fit_transform(text_entry2)
termfreq_matrix

<10000x13813 sparse matrix of type '<class 'numpy.float64'>'
	with 163063 stored elements in Compressed Sparse Row format>

In [21]:
termfreq_matrix.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [22]:
tfidf.vocabulary_

{'trump': 12540,
 'liter': 7408,
 'paid': 8945,
 'campaign': 2094,
 'advis': 690,
 'cnn': 2506,
 'wouldnt': 13640,
 'get': 4969,
 'work': 13609,
 'mayb': 7750,
 'plummet': 9330,
 'crime': 2998,
 'leav': 7249,
 'mass': 7710,
 'shooter': 11003,
 'aliv': 815,
 'carri': 2157,
 'nefari': 8340,
 'deed': 3227,
 'delet': 3269,
 'dem': 3282,
 'need': 8334,
 'new': 8378,
 'blood': 1713,
 'usernam': 13013,
 'relev': 10170,
 'wasnt': 13295,
 'kelli': 7046,
 'ann': 955,
 'curt': 3084,
 'shill': 10960,
 'remov': 10197,
 'dont': 3671,
 'licens': 7342,
 'pizza': 9283,
 'deliveri': 3275,
 'man': 7636,
 'drive': 3759,
 'allow': 827,
 'cannot': 2117,
 'becom': 1490,
 'teacher': 12071,
 'without': 13565,
 'first': 4535,
 'certifi': 2240,
 'ambul': 878,
 'driver': 3763,
 'despit': 3365,
 'trudeau': 12533,
 'teach': 12070,
 'offici': 8663,
 'educ': 3896,
 'public': 9761,
 'school': 10699,
 'children': 2344,
 'teen': 12085,
 'therefor': 12197,
 'technic': 12078,
 'though': 12243,
 'current': 3080,
 'capac': 

In [23]:
# Follow-Up Question
# For the three techniques in problem (2) above, give an example where each would be useful.

Word count Vector is used to find doucument similarity. For example, if there are two articles with related to football, then word counts of the two documents are calculated separted and the document similarity is calcuated by multiplying word count values and summing the result. If the final value is more then the documents are related. It also used to find the user interests based on his previous searches.

Parts of Speech tag vector is used in Natural Language Processing (NLP). NLP is best suited for customer service where call customer call recording can be analyzed to find emotion of the customer. For example, consider sentences 'He saw a bear' and 'His efforts will bear fruit'. The word bear has different meanings and POS tag vector can be used to identify the same.

Term Frequency Inverse Document Frequency (TFIDF) is mainly used in search engines. If TFIDF is closer to 0 then then word is more common. If any word is searched in search engine, TFIDF value is calcuated and results are ranked on how frequenctly the word appear and the result with highly ranked with be shown at the top.