In [1]:
import numpy as np
import pandas as pd
import re
import nltk
import spacy
import string
import theano
pd.options.mode.chained_assignment = None

# Let us load our data file

In [2]:

full_df = pd.read_csv("train-balanced-sarcasm.csv")

In [3]:
df = full_df[["comment"]]
df["comment"] = df["comment"].astype(str) #declaring the contents in the col --
#--> "comment" as a string variable.

full_df.head() # lets see what our rows and cols look like. 

Unnamed: 0,label,comment,author,subreddit,score,ups,downs,date,created_utc,parent_comment
0,0,NC and NH.,Trumpbart,politics,2,-1,-1,2016-10,2016-10-16 23:55:23,"Yeah, I get that argument. At this point, I'd ..."
1,0,You do know west teams play against west teams...,Shbshb906,nba,-4,-1,-1,2016-11,2016-11-01 00:24:10,The blazers and Mavericks (The wests 5 and 6 s...
2,0,"They were underdogs earlier today, but since G...",Creepeth,nfl,3,3,0,2016-09,2016-09-22 21:45:37,They're favored to win.
3,0,"This meme isn't funny none of the ""new york ni...",icebrotha,BlackPeopleTwitter,-8,-1,-1,2016-10,2016-10-18 21:03:47,deadass don't kill my buzz
4,0,I could use one of those tools.,cush2push,MaddenUltimateTeam,6,-1,-1,2016-12,2016-12-30 17:00:13,Yep can confirm I saw the tool they use for th...


# In order for us to model language and predict sarcasm, we must first wrangle and clean our data.

### We will first start with lower casing our "text". This will treat the string 'text', 'Text', 'TEXT' homogeneously. 

In [4]:
df["text_lower"] = df["comment"].str.lower()
df.head()

Unnamed: 0,comment,text_lower
0,NC and NH.,nc and nh.
1,You do know west teams play against west teams...,you do know west teams play against west teams...
2,"They were underdogs earlier today, but since G...","they were underdogs earlier today, but since g..."
3,"This meme isn't funny none of the ""new york ni...","this meme isn't funny none of the ""new york ni..."
4,I could use one of those tools.,i could use one of those tools.


### Now we will remove the following symbols 
 #### --> string.punctuation includes #$%&\'()*+,-./:;<=>?@[\\]^_{|}~`" 

In [5]:
# drop the new column created in last cell
df.drop(["text_lower"], axis=1, inplace=True)

In [6]:
PUNCT_TO_REMOVE = string.punctuation
def remove_punctuation(comment):
    """custom function to remove the punctuation"""
    return comment.translate(str.maketrans('', '', PUNCT_TO_REMOVE))

df["text_wo_punct"] = df["comment"].apply(lambda comment: remove_punctuation(comment))
df.head()

Unnamed: 0,comment,text_wo_punct
0,NC and NH.,NC and NH
1,You do know west teams play against west teams...,You do know west teams play against west teams...
2,"They were underdogs earlier today, but since G...",They were underdogs earlier today but since Gr...
3,"This meme isn't funny none of the ""new york ni...",This meme isnt funny none of the new york nigg...
4,I could use one of those tools.,I could use one of those tools


### Now we will remove stop words like "the", "a", "I" to help with accuracy

In [7]:
from nltk.corpus import stopwords
nltk.download('stopwords')
", ".join(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/valazeinali/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


"i, me, my, myself, we, our, ours, ourselves, you, you're, you've, you'll, you'd, your, yours, yourself, yourselves, he, him, his, himself, she, she's, her, hers, herself, it, it's, its, itself, they, them, their, theirs, themselves, what, which, who, whom, this, that, that'll, these, those, am, is, are, was, were, be, been, being, have, has, had, having, do, does, did, doing, a, an, the, and, but, if, or, because, as, until, while, of, at, by, for, with, about, against, between, into, through, during, before, after, above, below, to, from, up, down, in, out, on, off, over, under, again, further, then, once, here, there, when, where, why, how, all, any, both, each, few, more, most, other, some, such, no, nor, not, only, own, same, so, than, too, very, s, t, can, will, just, don, don't, should, should've, now, d, ll, m, o, re, ve, y, ain, aren, aren't, couldn, couldn't, didn, didn't, doesn, doesn't, hadn, hadn't, hasn, hasn't, haven, haven't, isn, isn't, ma, mightn, mightn't, mustn, mus

In [8]:
STOPWORDS = set(stopwords.words('english'))
def remove_stopwords(comment):
    """custom function to remove the stopwords"""
    return " ".join([word for word in str(comment).split() if word not in STOPWORDS])

df["text_wo_stop"] = df["text_wo_punct"].apply(lambda comment: remove_stopwords(comment))
df.head()

Unnamed: 0,comment,text_wo_punct,text_wo_stop
0,NC and NH.,NC and NH,NC NH
1,You do know west teams play against west teams...,You do know west teams play against west teams...,You know west teams play west teams east teams...
2,"They were underdogs earlier today, but since G...",They were underdogs earlier today but since Gr...,They underdogs earlier today since Gronks anno...
3,"This meme isn't funny none of the ""new york ni...",This meme isnt funny none of the new york nigg...,This meme isnt funny none new york nigga ones
4,I could use one of those tools.,I could use one of those tools,I could use one tools


### Lets see what some frequent words are and remove them

In [9]:
import matplotlib.pyplot as plt
%matplotlib inline
%pylab inline

from collections import Counter
cnt = Counter()
for comment in df["text_wo_stop"].values:
    for word in comment.split(): # going through every word in every tweet
        cnt[word] += 1
cnt.most_common(10)
        
        #### Changes made by *VALA ####
        ## Lets make Histogram of most frequent words
    #y_pos = word
    #freq = cnt[word] #get the freq of the words
        
    #plt.bar(y_pos,freq, align = 'center', alpha = .5)
    #plt.xticks(y_pos, word)
    #plt.ylabel('Frequent Words')
    #plt.title('Most frequent words in our Twitter data set')
        
    #plt.show()      

Populating the interactive namespace from numpy and matplotlib


[('I', 169221),
 ('like', 53550),
 ('dont', 36745),
 ('people', 34167),
 ('would', 34048),
 ('Yeah', 33124),
 ('get', 32644),
 ('Im', 30655),
 ('one', 29323),
 ('But', 27309)]

In [10]:
FREQWORDS = set([w for (w, wc) in cnt.most_common(10)])
def remove_freqwords(comment):
    """custom function to remove the frequent words"""
    return " ".join([word for word in str(comment).split() if word not in FREQWORDS])

df["text_wo_stopfreq"] = df["text_wo_stop"].apply(lambda comment: remove_freqwords(comment))
df.head()

Unnamed: 0,comment,text_wo_punct,text_wo_stop,text_wo_stopfreq
0,NC and NH.,NC and NH,NC NH,NC NH
1,You do know west teams play against west teams...,You do know west teams play against west teams...,You know west teams play west teams east teams...,You know west teams play west teams east teams...
2,"They were underdogs earlier today, but since G...",They were underdogs earlier today but since Gr...,They underdogs earlier today since Gronks anno...,They underdogs earlier today since Gronks anno...
3,"This meme isn't funny none of the ""new york ni...",This meme isnt funny none of the new york nigg...,This meme isnt funny none new york nigga ones,This meme isnt funny none new york nigga ones
4,I could use one of those tools.,I could use one of those tools,I could use one tools,could use tools


## Now lets remove some rare words

In [11]:
# Drop the two columns which are no more needed 
df.drop(["text_wo_punct", "text_wo_stop"], axis=1, inplace=True)

n_rare_words = 10
RAREWORDS = set([w for (w, wc) in cnt.most_common()[:-n_rare_words-1:-1]])
def remove_rarewords(comment):
    """custom function to remove the rare words"""
    return " ".join([word for word in str(comment).split() if word not in RAREWORDS])

df["text_wo_stopfreqrare"] = df["text_wo_stopfreq"].apply(lambda comment: remove_rarewords(comment))
df.head()

Unnamed: 0,comment,text_wo_stopfreq,text_wo_stopfreqrare
0,NC and NH.,NC NH,NC NH
1,You do know west teams play against west teams...,You know west teams play west teams east teams...,You know west teams play west teams east teams...
2,"They were underdogs earlier today, but since G...",They underdogs earlier today since Gronks anno...,They underdogs earlier today since Gronks anno...
3,"This meme isn't funny none of the ""new york ni...",This meme isnt funny none new york nigga ones,This meme isnt funny none new york nigga ones
4,I could use one of those tools.,could use tools,could use tools


## Now lets stem our words

In [None]:
from nltk.stem.porter import PorterStemmer

#Drop the two columns 
#df.drop(["text_wo_stopfreq", "text_wo_stopfreqrare"], axis=1, inplace=True) 

stemmer = PorterStemmer()

def stem_words(comment):
    return " ".join([stemmer.stem(word) for word in comment.split()])

df["text_stemmed"] = df["comment"].apply(lambda comment: stem_words(comment))
df.head()

## We can see that words like private and propose have their e at the end chopped off due to stemming. This is not intented. What can we do fort hat? We can use Lemmatization in such cases.

## Also this porter stemmer is for English language. If we are working with other languages, we can use snowball stemmer. The supported languages for snowball stemmer are

In [None]:
from nltk.stem.snowball import SnowballStemmer
SnowballStemmer.languages ## we can use snowballstemmer 
#for all these languages, WOAH

## Lets use lemmatization to make sure the root word in preserved!

In [None]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import nltk
nltk.download('wordnet')


lemmatizer = WordNetLemmatizer()
def lemmatize_words(text):
    return " ".join([lemmatizer.lemmatize(word) for word in comment.split()])

df["text_lemmatized"] = df["comment"].apply(lambda comment: lemmatize_words(comment))
df.head()

## We can see that the trailing e in the propose and private is retained when we use lemmatization unlike stemming.

In [None]:
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
nltk.download('averaged_perceptron_tagger')

lemmatizer = WordNetLemmatizer()
wordnet_map = {"N":wordnet.NOUN, "V":wordnet.VERB, "J":wordnet.ADJ, "R":wordnet.ADV}
def lemmatize_words(comment):
    pos_tagged_text = nltk.pos_tag(comment.split())
    return " ".join([lemmatizer.lemmatize(word, wordnet_map.get(pos[0], wordnet.NOUN)) for word, pos in pos_tagged_text])

df["text_lemmatized"] = df["comment"].apply(lambda comment: lemmatize_words(comment))
df.head()

## Convert emoticons to english 
### Ex: :-) to Happy_face_smiley

In [None]:
def convert_emoticons(comment):
    for emot in EMOTICONS:
        comment = re.sub(u'('+emot+')', "_".join(EMOTICONS[emot].replace(",","").split()), comment)
    return comment

## Convert emojis to words
### Ex:  🔥to fire

In [None]:
def convert_emojis(comment):
    for emot in UNICODE_EMO:
        comment = re.sub(r'('+emot+')', "_".join(UNICODE_EMO[emot].replace(",","").replace(":","").split()), comment)
    return comment

## Let us remove URLs 

In [None]:
def remove_urls(comment):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'', comment)

## Let us remove HTML tags

In [None]:
def remove_html(comment):
    html_pattern = re.compile('<.*?>')
    return html_pattern.sub(r'', comment)

# Now lets do some sarcasm detection

## First we will apply all the data cleaning to get a data frame that's reeady for analysis

In [None]:
#each dataframe update takes the previous data frame and updates it to 
#the current function action.

#lhs = new dataframe , rhs= old dataframe

#below the lhs is the text but all lowercase, the rhs is the 
#original tweets in the same format
full_df["text_lower"] = full_df["comment"].str.lower()

#remove punctuation 
full_df["text_wo_punct"] = full_df["text_lower"].apply(lambda comment: remove_punctuation(comment))

#remove rare words
full_df["text_wo_stopfreqrare"] = full_df["text_wo_punct"].apply(lambda comment: remove_rarewords(comment))

#remove frequent words
full_df["text_wo_stopfreq"] = full_df["text_wo_stopfreqrare"].apply(lambda comment: remove_freqwords(comment))

#lemmatize
full_df["text_lemmatized2"] = full_df["text_wo_stopfreq"].apply(lambda comment: lemmatize_words(comment))

## below we will convert emoticons like :-) to Happy_face_smiley and etc
#df["text_emotconsless"] = df["text_lemmatized2"].apply(lambda text: convert_emoticons(text))

## convert emojis to english text
#df["text_emojiless"] = df["text_emotconsless"].apply(lambda text: convert_emojis(text))

#remove URls
full_df["URL_gone"] = full_df["text_lemmatized2"].apply(lambda comment: remove_urls(comment))
                                           
#remove HTML tags
full_df["final_df"] = full_df["URL_gone"].apply(lambda comment: remove_urls(comment))


full_df.shape
full_df.head()


## Time for non data cleaning

In [None]:
full_df.dropna(subset=['final_df'], inplace=True)


In [None]:
full_df['label'].value_counts()
