In [1]:
import pandas as pd
import re  # For preprocessing
import pandas as pd  # For data handling
import spacy  # For preprocessing
import numpy as np
from langdetect import detect
import swifter

In [2]:
def detect_language(text):
    try:
        lang = detect(text)
    except:
        lang = 'unknown'
    return lang


In [3]:
path = '/Users/shafz/OneDrive/Documents/deep-learning-final-project-yelp_reviews_classification/data/raw'
tweets = pd.read_csv(path+'/ExtractedTweets.csv')
tweets.head(5)

Unnamed: 0,Party,Handle,Tweet
0,Democrat,RepDarrenSoto,"Today, Senate Dems vote to #SaveTheInternet. P..."
1,Democrat,RepDarrenSoto,RT @WinterHavenSun: Winter Haven resident / Al...
2,Democrat,RepDarrenSoto,RT @NBCLatino: .@RepDarrenSoto noted that Hurr...
3,Democrat,RepDarrenSoto,RT @NALCABPolicy: Meeting with @RepDarrenSoto ...
4,Democrat,RepDarrenSoto,RT @Vegalteno: Hurricane season starts on June...


In [4]:
tweets['Party'].value_counts()

Republican    44392
Democrat      42068
Name: Party, dtype: int64

In [5]:
tweets['Handle'].value_counts()

RepDarrenSoto     200
RepMarkMeadows    200
RepDaveJoyce      200
RodneyDavis       200
RepLukeMesser     200
                 ... 
HouseAdmnGOP      199
RepEliotEngel     199
Jim_Jordan        197
RepVisclosky      197
collinpeterson     80
Name: Handle, Length: 433, dtype: int64

# Data Cleaning 
Lets take a look at some reviews to see what data cleaning measures we need 
* We can see that we need to remove punctuation and that some reviews have a lot of white space 
* Both uppercase and lowercase letters are present 
* There are numerical digits as well 
* Some reviews contain expressions such as "this/that" and simply removing '/' results in "thisthat" instead of "this that" 
* Some reviews contained only punctuation (e.g. '.', ': )', '*') 
* Some reviews do not have proper spacing (e.g. 'Don't misinterpret my review....I' which turns into 'dont misinterpret my reviewi')
* There are reviews in Chinese and Spanish 
* Had some strange letters show up such as 'entrÃ©es' due to encoding/decoding issues
* Some reviews have website links in them 

In [6]:
for index,text in enumerate(tweets['Tweet'][:5]):
  print('Review %d:\n'%(index+1),text)
for index,text in enumerate(tweets['Tweet'][-5:]):
  print('Review %d:\n'%(index+1),text)

Review 1:
 Today, Senate Dems vote to #SaveTheInternet. Proud to support similar #NetNeutrality legislation here in the House… https://t.co/n3tggDLU1L
Review 2:
 RT @WinterHavenSun: Winter Haven resident / Alta Vista teacher is one of several recognized by @RepDarrenSoto for National Teacher Apprecia…
Review 3:
 RT @NBCLatino: .@RepDarrenSoto noted that Hurricane Maria has left approximately $90 billion in damages. 

Congress has allocated about $18…
Review 4:
 RT @NALCABPolicy: Meeting with @RepDarrenSoto . Thanks for taking the time to meet with @LatinoLeader ED Marucci Guzman. #NALCABPolicy2018.…
Review 5:
 RT @Vegalteno: Hurricane season starts on June 1st; Puerto Rico’s readiness...well 🤦🏼‍♂️😡😩@Pwr4PuertoRico @RepDarrenSoto @EspaillatNY
Review 1:
 Check out my op-ed on need for End Executive Overreach Act: The White House is crippling our economy https://t.co/XCmjLB8Qyd via @DCExaminer
Review 2:
 Yesterday, Betty &amp; I had a great time learning about the forestry industry which 

In [7]:
tweets['cleaned'] = tweets['Tweet'].copy().str.normalize('NFKD')\
       .str.encode('ascii', errors='ignore')\
       .str.decode('utf-8')

In [45]:
tweets['language'] = tweets['Tweet'].swifter.apply(detect_language)

Pandas Apply: 100%|██████████| 85729/85729 [06:48<00:00, 209.77it/s]


In [46]:
tweets.loc[tweets['language'] != 'en']

Unnamed: 0,Party,Handle,Tweet,cleaned,lemmatized,language
657,Democrat,RepEspaillat,ICYMI- @RepEspaillat @NydiaVelazquez @RepJeffr...,icymi join nycha residents local leaders prote...,icymi join nycha resident local leader protest...,fr
798,Democrat,RepEspaillat,RT @THEKIDMERO: METE MANO TIO!! 🇩🇴💪🏽 https://t...,thekidmero mete mano tio,thekidmero mete mano tio,pt
935,Democrat,RepBRochester,👇🏾#GetCovered https://t.co/nwj5i9185l,getcovered,getcovere,da
1204,Democrat,RepTomSuozzi,@randimarshall @Amtrak Thank you!,randimarshall amtrak thank you,randimarshall amtrak thank,id
1541,Democrat,RepRoKhanna,Thanks for having me! https://t.co/QiOzWvboqZ,thanks for having me,thank have,da
...,...,...,...,...,...,...
85621,Republican,RobWittman,"Stay safe today, folks. https://t.co/FPeYJBCho...",stay safe today folks,stay safe today folk,so
85853,Republican,RosLehtinen,Thx @WeAreALPA! Safer skies benefit us all! ht...,thx wearealpa safer skies benefit us all,thx wearealpa safe sky benefit,de
85902,Republican,WaysandMeansGOP,RT @PeterRoskam: https://t.co/ETliUWGvJc,,,sv
86069,Republican,GOPpolicy,We will #NeverForget. https://t.co/097kB5vI5Q,we will neverforget,neverforget,nl


In [47]:
tweets = tweets[tweets['language'] == 'en']

In [48]:
tweets.drop('language', axis = 1, inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [49]:
## Expanding Contractions 
contractions_dict = { "ain't": "are not","'s":" is","aren't": "are not",
                     "can't": "cannot","can't've": "cannot have",
                     "'cause": "because","could've": "could have","couldn't": "could not",
                     "couldn't've": "could not have", "didn't": "did not","doesn't": "does not",
                     "don't": "do not","hadn't": "had not","hadn't've": "had not have",
                     "hasn't": "has not","haven't": "have not","he'd": "he would",
                     "he'd've": "he would have","he'll": "he will", "he'll've": "he will have",
                     "how'd": "how did","how'd'y": "how do you","how'll": "how will",
                     "I'd": "I would", "I'd've": "I would have","I'll": "I will",
                     "I'll've": "I will have","I'm": "I am","I've": "I have", "isn't": "is not",
                     "it'd": "it would","it'd've": "it would have","it'll": "it will",
                     "it'll've": "it will have", "let's": "let us","ma'am": "madam",
                     "mayn't": "may not","might've": "might have","mightn't": "might not", 
                     "mightn't've": "might not have","must've": "must have","mustn't": "must not",
                     "mustn't've": "must not have", "needn't": "need not",
                     "needn't've": "need not have","o'clock": "of the clock","oughtn't": "ought not",
                     "oughtn't've": "ought not have","shan't": "shall not","sha'n't": "shall not",
                     "shan't've": "shall not have","she'd": "she would","she'd've": "she would have",
                     "she'll": "she will", "she'll've": "she will have","should've": "should have",
                     "shouldn't": "should not", "shouldn't've": "should not have","so've": "so have",
                     "that'd": "that would","that'd've": "that would have", "there'd": "there would",
                     "there'd've": "there would have", "they'd": "they would",
                     "they'd've": "they would have","they'll": "they will",
                     "they'll've": "they will have", "they're": "they are","they've": "they have",
                     "to've": "to have","wasn't": "was not","we'd": "we would",
                     "we'd've": "we would have","we'll": "we will","we'll've": "we will have",
                     "we're": "we are","we've": "we have", "weren't": "were not","what'll": "what will",
                     "what'll've": "what will have","what're": "what are", "what've": "what have",
                     "when've": "when have","where'd": "where did", "where've": "where have",
                     "who'll": "who will","who'll've": "who will have","who've": "who have",
                     "why've": "why have","will've": "will have","won't": "will not",
                     "won't've": "will not have", "would've": "would have","wouldn't": "would not",
                     "wouldn't've": "would not have","y'all": "you all", "y'all'd": "you all would",
                     "y'all'd've": "you all would have","y'all're": "you all are",
                     "y'all've": "you all have", "you'd": "you would","you'd've": "you would have",
                     "you'll": "you will","you'll've": "you will have", "you're": "you are",
                     "you've": "you have"}

contractions_re=re.compile('(%s)' % '|'.join(contractions_dict.keys()))


def expand_contractions(text,contractions_dict=contractions_dict):
  def replace(match):
    return contractions_dict[match.group(0)]
  return contractions_re.sub(replace, text)

In [59]:
# expanding contractions 
tweets['cleaned'] = tweets['Tweet'].apply(lambda x:expand_contractions(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tweets['cleaned'] = tweets['Tweet'].apply(lambda x:expand_contractions(x))


In [60]:
## Making all words lowercase, removing punctuation, URLs, and white spaces 
tweets['cleaned']  = tweets['cleaned'].apply(lambda x: re.sub(r'https?://\S+|www\.\S+', ' ', x))
tweets['cleaned']  = tweets['cleaned'].str.lower().apply(lambda x: re.sub(r"[\d\n\-\./]+", ' ', x))
tweets['cleaned']  = tweets['cleaned'].apply(lambda x: re.sub(' +',' ',x))
tweets['cleaned']  = tweets['cleaned'].apply(lambda x: re.sub(r'[^\w\s]', '', x))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tweets['cleaned']  = tweets['cleaned'].apply(lambda x: re.sub(r'https?://\S+|www\.\S+', ' ', x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tweets['cleaned']  = tweets['cleaned'].str.lower().apply(lambda x: re.sub(r"[\d\n\-\./]+", ' ', x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tweets['

In [61]:
tweets['cleaned']  = tweets['cleaned'].replace(' ', np.nan)
tweets['cleaned']  = tweets['cleaned'].replace('', np.nan)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tweets['cleaned']  = tweets['cleaned'].replace(' ', np.nan)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tweets['cleaned']  = tweets['cleaned'].replace('', np.nan)


In [62]:
# apply the lambda function to the 'text' column using the apply() method
tweets['cleaned'] = tweets['cleaned'].apply(lambda x: x.replace('rt ', '').strip())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tweets['cleaned'] = tweets['cleaned'].apply(lambda x: x.replace('rt ', '').strip())


In [63]:
def replace_word(text, words_to_check):
    for word in words_to_check:
        if word in text: 
            text = text.replace(word, '')
    return text


words_to_check = list(pd.Series(tweets['Handle'].unique()).str.lower())
# apply the function to the 'text' column
tweets['cleaned'] = tweets['cleaned'].swifter.apply(lambda x: replace_word(x, words_to_check))

Pandas Apply: 100%|██████████| 85117/85117 [00:04<00:00, 20650.37it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tweets['cleaned'] = tweets['cleaned'].swifter.apply(lambda x: replace_word(x, words_to_check))


In [64]:
tweets.isna().sum()

Party         0
Handle        0
Tweet         0
cleaned       0
lemmatized    0
dtype: int64

In [65]:
tweets.loc[tweets['cleaned'].isna()]

Unnamed: 0,Party,Handle,Tweet,cleaned,lemmatized


In [66]:
tweets.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


In [67]:
tweets['cleaned'] = tweets['cleaned'].apply(lambda x: re.sub(' +',' ',x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tweets['cleaned'] = tweets['cleaned'].apply(lambda x: re.sub(' +',' ',x))


In [68]:
for index,text in enumerate(tweets['cleaned'][:5]):
  print('Review %d:\n'%(index+1),text)
for index,text in enumerate(tweets['cleaned'][-5:]):
  print('Review %d:\n'%(index+1),text)


Review 1:
 today senate dems vote to savetheinternet proud to supposimilar netneutrality legislation here in the house
Review 2:
 winterhavensun winter haven resident alta vista teacher is one of several recognized by for national teacher apprecia
Review 3:
 nbclatino noted that hurricane maria has left approximately billion in damages congress has allocated about
Review 4:
 nalcabpolicy meeting with thanks for taking the time to meet with latinoleader ed marucci guzman nalcabpolicy
Review 5:
 vegalteno hurricane season starts on june st puerto ricos readiness well pwr puertorico espaillatny
Review 1:
 check out my op ed on need for end executive overreach act the white house is crippling our economy via dcexaminer
Review 2:
 yesterday betty amp i had a great time learning about the forestry industry which employs approx people in ga
Review 3:
 we are forever grateful for the service and sacrifice of major barney
Review 4:
 happy first day of school cobbschools cobbbacktoschool
Review 

In [69]:
tweets.head()

Unnamed: 0,Party,Handle,Tweet,cleaned,lemmatized
0,Democrat,RepDarrenSoto,"Today, Senate Dems vote to #SaveTheInternet. P...",today senate dems vote to savetheinternet prou...,today senate dem vote savetheinternet proud su...
1,Democrat,RepDarrenSoto,RT @WinterHavenSun: Winter Haven resident / Al...,winterhavensun winter haven resident alta vist...,winterhavensun winter haven resident alta vist...
2,Democrat,RepDarrenSoto,RT @NBCLatino: .@RepDarrenSoto noted that Hurr...,nbclatino noted that hurricane maria has left ...,nbclatino note hurricane maria leave approxima...
3,Democrat,RepDarrenSoto,RT @NALCABPolicy: Meeting with @RepDarrenSoto ...,nalcabpolicy meeting with thanks for taking th...,nalcabpolicy meeting thank take time meet lati...
4,Democrat,RepDarrenSoto,RT @Vegalteno: Hurricane season starts on June...,vegalteno hurricane season starts on june st p...,vegalteno hurricane season start june st puert...


In [70]:
tweets.to_csv('./tweets_cleaned.csv', index=False)

Removing stop words, lemmatizing, and tokenizing 


In [71]:
nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser'])

In [72]:
print(nlp.Defaults.stop_words)
len(nlp.Defaults.stop_words)

{'one', "n't", 'his', 'because', 'often', 'part', 'yours', 'we', 'moreover', 'for', 'every', 'front', 'been', 'she', 'twelve', 'not', 'of', 'since', 'your', 'whenever', 'sixty', 'see', 'he', 'few', 'therefore', 'my', 'why', 'make', 'seems', 'nor', 'doing', 'might', 'into', 'alone', 'some', 'me', 'whither', 'off', 'take', 'or', 'could', 'will', 'whereafter', 'was', 'although', 'their', 'him', 'have', 'wherein', 'own', 'last', 'always', 'anyway', 'than', 'is', 'up', 'toward', 'her', 'more', 'be', 'put', 'may', 'another', 'fifty', 'whom', 'which', 'whose', 'back', 'did', 'say', 'whether', 'everything', 'throughout', 'when', 'such', 'somehow', 'himself', 'does', 'nevertheless', 'something', 'by', '’ve', 'too', 'seemed', '’s', 'bottom', 'cannot', 'themselves', 'upon', 'no', 'here', "'m", 'would', 'myself', 'rather', 'them', 'down', 'behind', 'formerly', 'anywhere', 'those', 'former', 'less', 'various', 'out', 'nine', 'thus', 'empty', 'thereby', 'nothing', 're', 'the', 'an', 'two', "'ve", 'd

326

In [74]:
nlp.Defaults.stop_words -= {'not'}
len(nlp.Defaults.stop_words)

325

In [75]:
texts = tweets['cleaned'].tolist()
lemmatized_texts = []
for doc in nlp.pipe(texts, batch_size=1000, n_process=4):
    lemmatized_texts.append(' '.join([token.lemma_ for token in doc if (token.is_stop==False)]))

In [76]:
tweets['lemmatized'] = lemmatized_texts

In [77]:
tweets.isna().sum()

Party         0
Handle        0
Tweet         0
cleaned       0
lemmatized    0
dtype: int64

In [78]:
tweets.loc[tweets['lemmatized'].isna()]

Unnamed: 0,Party,Handle,Tweet,cleaned,lemmatized


In [79]:
tweets.dropna(inplace=True)

In [80]:
tweets.head(10)

Unnamed: 0,Party,Handle,Tweet,cleaned,lemmatized
0,Democrat,RepDarrenSoto,"Today, Senate Dems vote to #SaveTheInternet. P...",today senate dems vote to savetheinternet prou...,today senate dem vote savetheinternet proud su...
1,Democrat,RepDarrenSoto,RT @WinterHavenSun: Winter Haven resident / Al...,winterhavensun winter haven resident alta vist...,winterhavensun winter haven resident alta vist...
2,Democrat,RepDarrenSoto,RT @NBCLatino: .@RepDarrenSoto noted that Hurr...,nbclatino noted that hurricane maria has left ...,nbclatino note hurricane maria leave approxima...
3,Democrat,RepDarrenSoto,RT @NALCABPolicy: Meeting with @RepDarrenSoto ...,nalcabpolicy meeting with thanks for taking th...,nalcabpolicy meeting thank take time meet lati...
4,Democrat,RepDarrenSoto,RT @Vegalteno: Hurricane season starts on June...,vegalteno hurricane season starts on june st p...,vegalteno hurricane season start june st puert...
5,Democrat,RepDarrenSoto,RT @EmgageActionFL: Thank you to all who came ...,emgageactionfl thank you to all who came out t...,emgageactionfl thank come orlando gala success...
6,Democrat,RepDarrenSoto,Hurricane Maria left approx $90 billion in dam...,hurricane maria left approx billion in damages...,hurricane maria leave approx billion damage bi...
7,Democrat,RepDarrenSoto,RT @Tharryry: I am delighted that @RepDarrenSo...,tharryry i am delighted that will be voting fo...,tharryry delighted vote cra overrule fcc save ...
8,Democrat,RepDarrenSoto,RT @HispanicCaucus: Trump's anti-immigrant pol...,hispaniccaucus trump is anti immigrant policie...,hispaniccaucus trump anti immigrant policy hur...
9,Democrat,RepDarrenSoto,RT @RepStephMurphy: Great joining @WeAreUnidos...,great joining weareunidosus and for a roundta...,great join weareunidosus roundtable orlando ...


In [81]:
tweets.to_csv('./lemmatized_tweets.csv')