In [1]:
import pandas as pd
import re  # For preprocessing
import pandas as pd  # For data handling
import spacy  # For preprocessing
import numpy as np
from langdetect import detect
import swifter

In [2]:
path = '/Users/shafz/OneDrive/Documents/deep-learning-final-project-yelp_reviews_classification/data/raw'
tweets = pd.read_csv(path+'/reddit_raw.csv')
tweets = tweets.drop('Unnamed: 0', axis = 1)
tweets.head(5)

Unnamed: 0,text,subreddit,bucket,annotator,annotation,confidence
0,That particular part of the debate is especial...,europe,French politics,annotator03,Non-Moral,Confident
1,That particular part of the debate is especial...,europe,French politics,annotator01,Purity,Confident
2,That particular part of the debate is especial...,europe,French politics,annotator02,Thin Morality,Confident
3,"/r/france is pretty lively, with it's own ling...",europe,French politics,annotator03,Non-Moral,Confident
4,"/r/france is pretty lively, with it's own ling...",europe,French politics,annotator00,Non-Moral,Somewhat Confident


# Data Cleaning 
Lets take a look at some reviews to see what data cleaning measures we need 
* We can see that we need to remove punctuation and that some reviews have a lot of white space 
* Both uppercase and lowercase letters are present 
* There are numerical digits as well 
* Some reviews contain expressions such as "this/that" and simply removing '/' results in "thisthat" instead of "this that" 
* Some reviews contained only punctuation (e.g. '.', ': )', '*') 
* Some reviews do not have proper spacing (e.g. 'Don't misinterpret my review....I' which turns into 'dont misinterpret my reviewi')
* There are reviews in Chinese and Spanish 
* Had some strange letters show up such as 'entrÃ©es' due to encoding/decoding issues
* Some reviews have website links in them 

In [53]:
for index,text in enumerate(tweets['text'][:5]):
  print('Review %d:\n'%(index+1),text)
for index,text in enumerate(tweets['text'][-5:]):
  print('Review %d:\n'%(index+1),text)

Review 1:
 That particular part of the debate is especially funny. Macron was explaining he did not think FN voters were evil and that from where he comes from he knew many, and she was like "ooooh the evil FN invaders they're everywhere...!" Self-awareness: zero.
Review 2:
 That particular part of the debate is especially funny. Macron was explaining he did not think FN voters were evil and that from where he comes from he knew many, and she was like "ooooh the evil FN invaders they're everywhere...!" Self-awareness: zero.
Review 3:
 That particular part of the debate is especially funny. Macron was explaining he did not think FN voters were evil and that from where he comes from he knew many, and she was like "ooooh the evil FN invaders they're everywhere...!" Self-awareness: zero.
Review 4:
 /r/france is pretty lively, with it's own lingo that is usually deliberately bad translations - shitpost is "cacapoteau" or "compost", crosspost is "croixpoteau", etc. There's a mix of memes, po

In [54]:
## Expanding Contractions 
contractions_dict = { "ain't": "are not ","'s":" is ","aren't": "are not ",
                     "can't": "cannot ","can't've": "cannot have ",
                     "'cause": "because ","could've": "could have ","couldn't": "could not ",
                     "didn't": "did not ","doesn't": "does not ",
                     "don't": "do not ","hadn't": "had not ",
                     "hasn't": "has not ","haven't": "have not ","he'd": "he would ",
                     "he'd've": "he would have ","he'll ": "he will ",
                     "how'd": "how did ","how'll": "how will ",
                     "I'd": "I would ", "I'll ": "I will ",
                     "I'm": "I am ","I've": "I have ", "isn't": "is not ",
                     "it'd": "it would ","it'll": "it will ",
                     "let's": "let us ","ma'am": "madam ",
                     "might've": "might have ","mightn't": "might not ", 
                     "must've": "must have ","mustn't": "must not ",
                     "needn't": "need not ", "shan't": "shall not" ,
                     "she'd": "she would ",
                     "she'll": "she will ", "should've": "should have ",
                     "shouldn't": "should not ", 
                     "that'd": "that would ",  "there'd": "there would ",
                     "they'd": "they would ",
                     "they'll": "they will ",
                     "they're": "they are ","they've": "they have ",
                     "to've": "to have ","wasn't": "was not ","we'd": "we would ",
                     "we'll": "we will ",
                     "we're": "we are ","we've": "we have ", "weren't": "were not ","what'll": "what will ",
                     "what're": "what are ", "what've": "what have ",
                     "when've": "when have ","where'd": "where did ", "where've": "where have ",
                     "who'll": "who will ","who've": "who have ",
                     "why've": "why have ","will've": "will have ","won't": "will not ",
                     "would've": "would have ","wouldn't": "would not ",
                     "y'all": "you all ", "you'd": "you would ",
                     "you'll": "you will ", "you're": "you are ",
                     "you've": "you have "}

contractions_re=re.compile('(%s)' % '|'.join(contractions_dict.keys()))


def expand_contractions(text,contractions_dict=contractions_dict):
  def replace(match):
    return contractions_dict[match.group(0)]
  return contractions_re.sub(replace, text)

In [55]:
# expanding contractions 
tweets['cleaned'] = tweets['text'].apply(lambda x:expand_contractions(x))

In [56]:
tweets['cleaned'][-5:]

61221    Well I can discern from your vehemence toward ...
61222    Kick! Punch! It is  all in the mind. If you wa...
61223    Reddit can’t help you this is some seriously t...
61224    Yes. Disordered eating is insidious. And Rita ...
61225    What parent would let a kid bring a Gameboy to...
Name: cleaned, dtype: object

In [57]:
## Making all words lowercase, removing punctuation, URLs, and white spaces 
tweets['cleaned'] = tweets['cleaned'].apply(lambda x: x.replace('/r/', '').strip())
tweets['cleaned']  = tweets['cleaned'].apply(lambda x: re.sub(r'https?://\S+|www\.\S+', ' ', x))
tweets['cleaned']  = tweets['cleaned'].str.lower().apply(lambda x: re.sub(r"[\d\n\-\./]+", ' ', x))
tweets['cleaned']  = tweets['cleaned'].apply(lambda x: re.sub(' +',' ',x))
tweets['cleaned']  = tweets['cleaned'].apply(lambda x: re.sub(r'[^\w\s]', '', x))
tweets['cleaned'] = tweets['cleaned'].apply(lambda x: x.replace('op ', '').strip())
tweets['cleaned'] = tweets['cleaned'].apply(lambda x: x.replace('gt ', '').strip())
tweets['cleaned']  = tweets['cleaned'].replace(' ', np.nan)
tweets['cleaned']  = tweets['cleaned'].replace('', np.nan)
# tweets['cleaned'] = tweets['cleaned'].apply(lambda x: re.sub(' +',' ',x))

In [58]:
tweets.tail()

Unnamed: 0,text,subreddit,bucket,annotator,annotation,confidence,cleaned
61221,Well I can discern from your vehemence toward ...,AmItheAsshole,Everyday Morality,annotator05,Equality,Confident,well i can discern from your vehemence toward ...
61222,Kick! Punch! It's all in the mind. If you wann...,nostalgia,Everyday Morality,annotator05,Thin Morality,Somewhat Confident,kick punch it is all in the mind if you wanna ...
61223,Reddit can’t help you this is some seriously t...,confession,Everyday Morality,annotator05,Thin Morality,Confident,reddit cant help you this is some seriously tr...
61224,Yes. Disordered eating is insidious. And Rita ...,AmItheAsshole,Everyday Morality,annotator05,Non-Moral,Somewhat Confident,yes disordered eating is insidious and rita ne...
61225,What parent would let a kid bring a Gameboy to...,nostalgia,Everyday Morality,annotator05,Authority,Somewhat Confident,what parent would let a kid bring a gameboy to...


In [59]:
tweets.isna().sum()

text           0
subreddit      0
bucket         0
annotator      0
annotation     0
confidence    42
cleaned        0
dtype: int64

In [60]:
for index,text in enumerate(tweets['cleaned'][:5]):
  print('Review %d:\n'%(index+1),text)
for index,text in enumerate(tweets['cleaned'][-5:]):
  print('Review %d:\n'%(index+1),text)

Review 1:
 that particular part of the debate is especially funny macron was explaining he did not think fn voters were evil and that from where he comes from he knew many and she was like ooooh the evil fn invaders they are everywhere  self awareness zero
Review 2:
 that particular part of the debate is especially funny macron was explaining he did not think fn voters were evil and that from where he comes from he knew many and she was like ooooh the evil fn invaders they are everywhere  self awareness zero
Review 3:
 that particular part of the debate is especially funny macron was explaining he did not think fn voters were evil and that from where he comes from he knew many and she was like ooooh the evil fn invaders they are everywhere  self awareness zero
Review 4:
 france is pretty lively with it is own lingo that is usually deliberately bad translations shitpost is cacapoteau or compost crosspost is croixpoteau etc there is a mix of memes politics international news personal pro

In [61]:
# path = '/Users/shafz/OneDrive/Documents/deep-learning-final-project-yelp_reviews_classification/data/interim'
# tweets.to_csv(path + './og_reddit_cleaned.csv', index=False)

Removing stop words, lemmatizing, and tokenizing 


In [77]:
path = '/Users/shafz/OneDrive/Documents/deep-learning-final-project-yelp_reviews_classification/data/interim'
tweets = pd.read_csv(path + './og_reddit_cleaned.csv')

In [81]:
nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser'])

In [82]:
print(nlp.Defaults.stop_words)
len(nlp.Defaults.stop_words)

{'please', 'we', 'some', 'say', 'thereupon', 'hereby', 'never', 'toward', 'ever', 'nine', 'know', 'want', 'same', 'hence', '’ll', 'where', 'us', 'our', 'might', 'all', 'elsewhere', 'moreover', 'before', 'you', 'out', 'except', 'anywhere', 'either', 'thence', 'forty', 'under', 'until', 'side', 'she', 'must', 'becomes', '’s', 'nowhere', 'whereby', 'being', 'and', 'call', 'he', 'think', 'whereas', 'if', 'go', 'every', 'herein', 'is', 'sixty', 'sometime', 'by', 'yours', 'already', 'upon', 'his', 'mine', 'various', 'nor', 'on', "'ve", 'although', 'show', 'whither', 'still', 'yourselves', 'hereupon', 'wherever', 'people', 'then', 'bottom', 'several', '‘re', 'across', 'ours', 'however', 'your', 'see', 'not', 'latter', 'am', 'around', 'least', 'six', 'twenty', 'empty', 'because', 'after', 'here', 'take', 'why', 'becoming', 'ourselves', 'than', 'noone', 'further', 'anything', 'how', 'between', 'latterly', 'or', 'unless', 'put', 'them', 'former', 'four', 'of', 'another', 'onto', 'seems', 'have',

334

In [83]:
nlp.Defaults.stop_words -= {'not', 'always'}
nlp.Defaults.stop_words |= {'go', 'come', 'not', 'like', 'thing', 'people', 'think', 'want', 'tell', 'want', 'need', 'know'}
len(nlp.Defaults.stop_words)

334

In [84]:
texts = tweets['cleaned'].tolist()
lemmatized_texts = []
for doc in nlp.pipe(texts, batch_size=1000, n_process=4):
    lemmatized_texts.append(' '.join([token.lemma_ for token in doc if (token.is_stop==False)]))

In [85]:
tweets['lemmatized'] = lemmatized_texts

In [86]:
tweets.isna().sum()

text           0
subreddit      0
bucket         0
annotator      0
annotation     0
confidence    42
cleaned        0
lemmatized     0
dtype: int64

In [87]:
tweets.head(10)

Unnamed: 0,text,subreddit,bucket,annotator,annotation,confidence,cleaned,lemmatized
0,That particular part of the debate is especial...,europe,French politics,annotator03,Non-Moral,Confident,that particular part of the debate is especial...,particular debate especially funny macron expl...
1,That particular part of the debate is especial...,europe,French politics,annotator01,Purity,Confident,that particular part of the debate is especial...,particular debate especially funny macron expl...
2,That particular part of the debate is especial...,europe,French politics,annotator02,Thin Morality,Confident,that particular part of the debate is especial...,particular debate especially funny macron expl...
3,"/r/france is pretty lively, with it's own ling...",europe,French politics,annotator03,Non-Moral,Confident,france is pretty lively with it is own lingo t...,france pretty lively lingo usually deliberatel...
4,"/r/france is pretty lively, with it's own ling...",europe,French politics,annotator00,Non-Moral,Somewhat Confident,france is pretty lively with it is own lingo t...,france pretty lively lingo usually deliberatel...
5,"/r/france is pretty lively, with it's own ling...",europe,French politics,annotator02,Non-Moral,Confident,france is pretty lively with it is own lingo t...,france pretty lively lingo usually deliberatel...
6,TBH Marion Le Pen would be better. Closet fasc...,neoliberal,French politics,annotator03,Non-Moral,Somewhat Confident,tbh marion le pen would be better closet fasci...,tbh marion le pen well closet fascist vs flamb...
7,TBH Marion Le Pen would be better. Closet fasc...,neoliberal,French politics,annotator00,Thin Morality,Not Confident,tbh marion le pen would be better closet fasci...,tbh marion le pen well closet fascist vs flamb...
8,TBH Marion Le Pen would be better. Closet fasc...,neoliberal,French politics,annotator02,Equality,Somewhat Confident,tbh marion le pen would be better closet fasci...,tbh marion le pen well closet fascist vs flamb...
9,it really is a very unusual situation isn't it...,europe,French politics,annotator03,Non-Moral,Confident,it really is a very unusual situation is not i...,unusual situation fillon affair influence vote...


In [88]:
tweets.drop('cleaned', axis = 1, inplace=True)

In [89]:
tweets

Unnamed: 0,text,subreddit,bucket,annotator,annotation,confidence,lemmatized
0,That particular part of the debate is especial...,europe,French politics,annotator03,Non-Moral,Confident,particular debate especially funny macron expl...
1,That particular part of the debate is especial...,europe,French politics,annotator01,Purity,Confident,particular debate especially funny macron expl...
2,That particular part of the debate is especial...,europe,French politics,annotator02,Thin Morality,Confident,particular debate especially funny macron expl...
3,"/r/france is pretty lively, with it's own ling...",europe,French politics,annotator03,Non-Moral,Confident,france pretty lively lingo usually deliberatel...
4,"/r/france is pretty lively, with it's own ling...",europe,French politics,annotator00,Non-Moral,Somewhat Confident,france pretty lively lingo usually deliberatel...
...,...,...,...,...,...,...,...
61221,Well I can discern from your vehemence toward ...,AmItheAsshole,Everyday Morality,annotator05,Equality,Confident,discern vehemence woman want think equality hu...
61222,Kick! Punch! It's all in the mind. If you wann...,nostalgia,Everyday Morality,annotator05,Thin Morality,Somewhat Confident,kick punch mind wanna test sure find thing tea...
61223,Reddit can’t help you this is some seriously t...,confession,Everyday Morality,annotator05,Thin Morality,Confident,reddit not help seriously traumatic shit coast...
61224,Yes. Disordered eating is insidious. And Rita ...,AmItheAsshole,Everyday Morality,annotator05,Non-Moral,Somewhat Confident,yes disorder eat insidious rita need help sort...


In [90]:
path = '/Users/shafz/OneDrive/Documents/deep-learning-final-project-yelp_reviews_classification/data/processed/'
tweets.to_csv(path + 'lemmatized_reddit_og.csv')