1: You don't need to replicate every step we did with the new dataset. Just roughly follow the steps in the lab: Data Preparation, Data Transformation, Data Preprocessing, Data Exploration. Pretend you're preparing to actually use that dataset. Make sure you include the steps you need to include.


# Data Preparation

In [70]:
# necessary for when working with external scripts
%load_ext autoreload
%autoreload 2

In [71]:
import pandas as pd
import helpers.data_mining_helpers as dmh

def load_labelled_sentences(path, source, **read_csv_kwargs):
    """Read one tab-separated ``sentence<TAB>label`` file and tag it with its source.

    Parameters
    ----------
    path : str
        Location of the tab-separated text file (read without a header row).
    source : str
        Value written to the added 'Source' column for every row.
    **read_csv_kwargs
        Extra keyword arguments forwarded unchanged to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        Columns 'Sentence', 'Sentiment' and 'Source'.
    """
    frame = pd.read_csv(path, sep="\t", names=['Sentence', 'Sentiment'], **read_csv_kwargs)
    frame['Source'] = source
    return frame


reviews_yelp = load_labelled_sentences('sentiment labelled sentences/yelp_labelled.txt', 'Yelp')
reviews_imdb = load_labelled_sentences('sentiment labelled sentences/imdb_labelled.txt', 'IMDB')
reviews_amazon = load_labelled_sentences('sentiment labelled sentences/amazon_cells_labelled.txt', 'Amazon')

# NOTE(review): the combined frame has 2748 rows, i.e. IMDB contributes fewer rows
# than the other two sources. Possibly unbalanced double quotes in the IMDB file
# make the parser merge lines -- TODO confirm, and if so pass
# quoting=csv.QUOTE_NONE for that file.

# Stack the three sources and rebuild a clean 0..n-1 index.
reviews = pd.concat([reviews_yelp, reviews_imdb, reviews_amazon]).reset_index(drop=True)
reviews

Unnamed: 0,Sentence,Sentiment,Source
0,Wow... Loved this place.,1,Yelp
1,Crust is not good.,0,Yelp
2,Not tasty and the texture was just nasty.,0,Yelp
3,Stopped by during the late May bank holiday of...,1,Yelp
4,The selection on the menu was great and so wer...,1,Yelp
...,...,...,...
2743,The screen does get smudged easily because it ...,0,Amazon
2744,What a piece of junk.. I lose more calls on th...,0,Amazon
2745,Item Does Not Match Picture.,0,Amazon
2746,The only thing that disappoint me is the infra...,0,Amazon


# Data Transformation

In [None]:
# dataframe and relevant columns already converted in above step
# doing some queries to become familiar with data

In [66]:
# look at a single record (label 1800 is an arbitrary, hard-coded pick -- not a random sample)
reviews.loc[1800]

Sentence     A usable keyboard actually turns a PDA into a ...
Sentiment                                                    1
Source                                                  Amazon
Name: 1800, dtype: object

In [67]:
# display the untruncated text of that record's sentence
reviews.at[1800, 'Sentence']

'A usable keyboard actually turns a PDA into a real-world useful machine instead of just a neat gadget.'

In [59]:
# show every 100th record (positional stride) to spot-check that rows from
# all three sources are present
reviews.iloc[::100]

Unnamed: 0,Sentence,Sentiment,Source
0,Wow... Loved this place.,1,Yelp
100,Our server was fantastic and when he found out...,1,Yelp
200,"I had heard good things about this place, but ...",1,Yelp
300,Good beer & drink selection and good food sele...,1,Yelp
400,This one is simply a disappointment.,0,Yelp
500,I also had to taste my Mom's multi-grain pumpk...,1,Yelp
600,I miss it and wish they had one in Philadelphia!,1,Yelp
700,The chips that came out were dripping with gre...,0,Yelp
800,I'm super pissd.,0,Yelp
900,Spend your money elsewhere.,0,Yelp


In [68]:
# take every 5th sentence and show the first 10 of them
reviews.loc[::5, "Sentence"].head(10)

0                              Wow... Loved this place.
5        Now I am getting angry and I want my damn pho.
10                             Service was very prompt.
15    I was shocked because no signs indicate cash o...
20                                  The Burrittos Blah!
25    That's right....the red velvet cake.....ohhh t...
30    Also there are combos like a burger, fries, an...
35    The only redeeming quality of the restaurant w...
40                         The shrimp tender and moist.
45    The only thing I did like was the prime rib an...
Name: Sentence, dtype: object

In [69]:
# element-wise mask: True wherever a value is missing
reviews.isna()

Unnamed: 0,Sentence,Sentiment,Source
0,False,False,False
1,False,False,False
2,False,False,False
3,False,False,False
4,False,False,False
...,...,...,...
2743,False,False,False
2744,False,False,False
2745,False,False,False
2746,False,False,False


In [72]:
# run the project helper on each column of the missing-value mask
reviews.isnull().apply(dmh.check_missing_values)

Sentence     (The amoung of missing records is: , 0)
Sentiment    (The amoung of missing records is: , 0)
Source       (The amoung of missing records is: , 0)
dtype: object

In [73]:
# check for duplicates: True marks a row that repeats an earlier identical row
# (the first occurrence of each row stays False)
reviews.duplicated()

0       False
1       False
2       False
3       False
4       False
        ...  
2743    False
2744    False
2745    False
2746    False
2747    False
Length: 2748, dtype: bool

In [74]:
# number of rows flagged as repeats of an earlier row
int(reviews.duplicated().sum())

17

In [77]:
# Show every row that takes part in a duplicate group (keep=False flags all
# occurrences, not only the repeats).
# Decision: KEEP the duplicates. Short reviews such as "I love this place." can
# plausibly be written independently by different people, so dropping them could
# remove genuine observations and affect the integrity of the data.
reviews[reviews.duplicated(keep=False)]

Unnamed: 0,Sentence,Sentiment,Source
334,I love this place.,1,Yelp
380,I won't be back.,0,Yelp
383,The food was terrible.,0,Yelp
505,I would not recommend this place.,0,Yelp
814,I love this place.,1,Yelp
816,The food was terrible.,0,Yelp
843,I won't be back.,0,Yelp
846,I would not recommend this place.,0,Yelp
1029,Definitely worth checking out.,1,IMDB
1064,10/10,1,IMDB


In [79]:
# Persist the combined frame; with to_csv's defaults the row index is written
# as an unnamed first column.
reviews.to_csv('reviews.csv')

# Data Preprocessing

In [81]:
# Feature Creation
import nltk
from sklearn.feature_extraction.text import CountVectorizer

# Tokenise every sentence with the project helper and store the token lists
# in a new 'Unigrams' column on `reviews`.
reviews['Unigrams'] = reviews['Sentence'].apply(dmh.tokenize_text)

# Fit a bag-of-words vocabulary on the raw sentences and keep the count matrix.
count_vect = CountVectorizer()
reviews_counts = count_vect.fit_transform(reviews['Sentence'])

# Callable wrapping the vectorizer's own preprocessing and tokenisation steps.
analyze = count_vect.build_analyzer()

reviews['Unigrams']

0                       [Wow, ..., Loved, this, place, .]
1                               [Crust, is, not, good, .]
2       [Not, tasty, and, the, texture, was, just, nas...
3       [Stopped, by, during, the, late, May, bank, ho...
4       [The, selection, on, the, menu, was, great, an...
                              ...                        
2743    [The, screen, does, get, smudged, easily, beca...
2744    [What, a, piece, of, junk.., I, lose, more, ca...
2745                 [Item, Does, Not, Match, Picture, .]
2746    [The, only, thing, that, disappoint, me, is, t...
2747    [You, can, not, answer, calls, with, the, unit...
Name: Unigrams, Length: 2748, dtype: object

In [83]:
# Feature Subset Selection: Create Document-Term Matrix
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer `get_feature_names_out()` and fall back only on older installs.
if hasattr(count_vect, 'get_feature_names_out'):
    feature_names = count_vect.get_feature_names_out()
else:
    feature_names = count_vect.get_feature_names()

# Dense copy of the counts: one row per sentence, one column per vocabulary term.
matrix = pd.DataFrame(reviews_counts.toarray(), columns=feature_names)

matrix

Unnamed: 0,00,10,100,11,12,13,15,15g,15pm,17,...,yucky,yukon,yum,yummy,yun,z500a,zero,zillion,zombie,zombiez
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2743,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2744,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2745,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2746,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Cleaning the term-frequency table
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Corpora required by the lemmatizer and the stop-word list.
nltk.download('wordnet')
nltk.download('stopwords')

# Build the table this cell cleans: one row per vocabulary term with its total
# count over all sentences (column sums of the document-term matrix).
termfreq_raw = matrix.sum().rename_axis('term').reset_index(name='freq')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

termfreq_clean = (
    termfreq_raw
    # remove numbers: for the purpose of this example, we assume that numbers
    # are not relevant for the analysis
    .loc[lambda df: ~df['term'].str.isnumeric()]
    # remove _ at the beginning and end of terms (vectorised, no row loop) ...
    .assign(term=lambda df: df['term'].str.strip('_'))
    # ... and drop terms that consisted only of underscores
    .loc[lambda df: df['term'] != '']
    # no explicit lowercasing step: CountVectorizer() lowercases by default
    # remove stop words BEFORE lemmatizing, otherwise the noun lemmatizer can
    # mangle them (e.g. 'was' -> 'wa') and they slip past the stop-word list
    .loc[lambda df: ~df['term'].isin(stop_words)]
    # lemmatize
    .assign(term=lambda df: df['term'].map(lemmatizer.lemmatize))
    # a lemma can itself be a stop word, so filter once more
    .loc[lambda df: ~df['term'].isin(stop_words)]
)

# after cleaning, aggregate terms that collapsed onto the same cleaned form
termfreq = termfreq_clean.groupby('term').aggregate({'freq': 'sum'})

# NOTE(review): an earlier comment here quoted a reduction from ~36K to ~29.5K
# terms; those figures look like they belong to a different dataset. Compare
# len(termfreq_raw) with the shape below for this one.
termfreq.shape

In [None]:
# Attribute Transformation / Aggregation