In [2]:
import nltk
import pandas as pd
from nltk.tokenize import TweetTokenizer
import random
# http://www.nltk.org/book/ch06.html
# http://www.nltk.org/api/nltk.tokenize.html
# http://thinknook.com/twitter-sentiment-analysis-training-corpus-dataset-2012-09-22/
# http://www.laurentluce.com/posts/twitter-sentiment-analysis-using-python-and-nltk/

In [3]:
tweets_df = pd.read_csv('Sentiment-Analysis-Dataset.csv', error_bad_lines=False)
print(tweets_df.shape[0])


b'Skipping line 8836: expected 4 fields, saw 5\n'
b'Skipping line 535882: expected 4 fields, saw 7\n'


1578612


In [4]:
# store positive and negative tweets
tweet_negative = []
tweet_positive = []

# separate the negative and positive tweets
tweet_negative = tweets_df.loc[tweets_df['Sentiment'] == 0]
tweet_positive = tweets_df.loc[tweets_df['Sentiment'] == 1]


tweet_negative['Sentiment'].replace(0, 'negative',inplace=True)
tweet_positive['Sentiment'].replace(1, 'positive',inplace=True)

# only keep tweets and their label as a list
df_negative = list(zip(tweet_negative['SentimentText'], tweet_negative['Sentiment']))
df_positive = list(zip(tweet_positive['SentimentText'], tweet_positive['Sentiment']))
        
        
        

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [5]:
# create a single list of tuples each containing two elements
# First element is an array containing the words and the second element is the type of sentimenet
# We get rid of the words smaller than 2 characaters and we use lowercase for everything

# tokenize the tweets
tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)

tweets = []
for (words, sentiment) in df_negative + df_positive:
    words = tknzr.tokenize(words)
    words_filtered = []
    # add some rules to better tokenize
    for e in words:
        e = e.lower()
        if ':' in e:
            e = e.replace(':', '')
        if '#' in e:
            e = e.replace('#', '')
        if '!' in e:
            e = e.replace('!', '')
        if '?' in e:
            e = e.replace('?', '')
        if ';' in e:
            e = e.replace(';', '')
        if '.' in e:
            e = e.replace('.', '')
        if ',' in e:
            e = e.replace(',', '')
        if '@' in e:
            e = e.replace('@', '')
        if len(e) >= 3:
            words_filtered.append(e)
    tweets.append((words_filtered, sentiment))
        
        
    #words_filtered = [e.lower() for e in words.split() if len(e) >= 3 and ':' not in e and '@' not in e and '!' not in e  and not '?' in e and not '.' in e and not ';' in e and not ',']
    #tweets.append((words_filtered, sentiment))
    
# shuffle the tweets dataset
random.shuffle(tweets)


In [6]:
a = '@remy: This is #waaaaayyyy too much for you!!!!!!'
a = tknzr.tokenize(a)
print(a)

[':', 'This', 'is', '#waaayyy', 'too', 'much', 'for', 'you', '!', '!', '!']


In [7]:
# training and test set
# we keep 90% for the training set and 10% for the test set
limit_set = int(tweets_df.shape[0]/10)
test_set = tweets[:1000]
training_set = tweets[1000:10000]


In [8]:
#test_set

In [9]:
# Classifier

# the list of words features need to be extracted from the tweets. 
# it is a list with every distinct words ordered by frequency of appearance

def get_words_in_tweets(tweets):
    all_words = []
    for (words, sentiment) in tweets:
        all_words.extend(words)
    return all_words

def get_word_feature(wordlist):
    wordlist = nltk.FreqDist(wordlist)
    word_features = wordlist.most_common()
    # split the list of tuples to keep the words
    word_features, freq = map(list,zip(*word_features))
    return word_features

word_features = get_word_feature(get_words_in_tweets(tweets))
print(word_features[:50])



['the', 'and', 'you', 'for', 'have', 'that', 'but', "i'm", 'just', 'with', 'not', 'was', 'this', 'now', 'good', 'day', 'get', 'all', 'out', 'like', 'are', 'today', "it's", 'too', 'your', 'love', 'going', 'work', 'got', 'lol', 'time', 'back', 'from', 'what', 'one', 'will', 'know', 'about', 'really', "don't", 'had', 'can', 'see', 'some', "can't", 'its', 'still', 'night', 'well', 'new']


In [10]:
# feature extractor
# dictionnary indicating what words are contained in the input passed
def extract_features(document):
    document_words = set(document)
    features = {}
    count = 0
    for word in word_features:
        # cerate our dictionnary of 3000 words
        if count == 3000:
            break
        
        features['contain(%s)' %word] = (word in document_words)
        count = count + 1
    return features

In [11]:
len(word_features)

385038

In [12]:
# apply feature to our classifier 
training_set = nltk.classify.apply_features(extract_features, training_set)
test_set = nltk.classify.apply_features(extract_features, test_set)
print('done')

done


In [13]:
#print(test_set)

In [14]:
# train our classifier
classifier = nltk.NaiveBayesClassifier.train(training_set)
print('done')

done


In [15]:
print(nltk.classify.accuracy(classifier, test_set))

0.732


In [16]:
#print(test_set)

In [18]:
# saracasm dataset header
columns = ['index', 'tweets']
# read the dataframe
saracasm_df = pd.read_csv('sarcasticTweets.csv', names = columns, error_bad_lines=False)
# get rid of NaN value
saracasm_df = saracasm_df.dropna(how='any') 
print(saracasm_df.shape[0])
#saracasm_df

802


In [19]:
# sarcastic tester
#for tweet in saracasm_df.tweets:
#    print(tweet)
#sarcastic_set = nltk.classify.apply_features(extract_features, sarcastic_tweet)

In [20]:
# tokenize the tweets
tweets_sarcastic = []
for tweet_s in saracasm_df['tweets']:
    tweet_s = tknzr.tokenize(tweet_s)
    words_filtered = []
    for e in tweet_s:
        e = e.lower()
        if ':' in e:
            e = e.replace(':', '')
        if '#' in e:
            e = e.replace('#', '')
        if '!' in e:
            e = e.replace('!', '')
        if '?' in e:
            e = e.replace('?', '')
        if ';' in e:
            e = e.replace(';', '')
        if '.' in e:
            e = e.replace('.', '')
        if ',' in e:
            e = e.replace(',', '')
        if '@' in e:
            e = e.replace('@', '')
        if len(e) >= 3:
            words_filtered.append(e)
    tweets_sarcastic.append(words_filtered)
#print(tweets_sarcastic)

In [21]:
# sarcastic sentiment tester
index = 0
for original_tweet in saracasm_df.tweets:
    # original tweet
    print(original_tweet)
    # tokenized tweet
    print(tweets_sarcastic[index])
    print(' ')
    # prediction
    print(classifier.classify(extract_features(tweets_sarcastic[index])))
    index = index + 1
    print(' ')
#test_set = nltk.classify.apply_features(extract_features, test_set)

Very low entries in this giveaway! Hop over and check it out! http://t.co/OrUSN9ne

['very', 'low', 'entries', 'this', 'giveaway', 'hop', 'over', 'and', 'check', 'out', 'http//tco/orusn9ne']
 
positive
 
Summers over and it was shit. Only sunny day was a bbq in north west with @Kade_96 and @Jess_birch95 hahhahaaa #amazing #sarcasm

['summers', 'over', 'and', 'was', 'shit', 'only', 'sunny', 'day', 'was', 'bbq', 'north', 'west', 'with', 'and', 'hahhahaaa', 'amazing', 'sarcasm']
 
positive
 
Did you know that one of Bolshoi Ballet members was born in Maranhão? Bruna Gaglianone is part of the Moscow group! http://t.co/LG3SME1l

['did', 'you', 'know', 'that', 'one', 'bolshoi', 'ballet', 'members', 'was', 'born', 'maranhão', 'bruna', 'gaglianone', 'part', 'the', 'moscow', 'group', 'http//tco/lg3sme1l']
 
positive
 
I just love missing the bus! ☺ #sarcasm

['just', 'love', 'missing', 'the', 'bus', 'sarcasm']
 
negative
 
If you're going into highschool and cant do at least 4 pirouettes you ob

positive
 
@ESPN_BMX Pres. Obama should have said "freestyle? Is that like dressage on a bike?"

['pres', 'obama', 'should', 'have', 'said', 'freestyle', 'that', 'like', 'dressage', 'bike']
 
positive
 
@GFBguy @TonyMichael @ServiTechLabs @rictownsend @Bpowatch Thank you for following MobileWorxs :)

['thank', 'you', 'for', 'following', 'mobileworxs']
 
positive
 
“@ItsGillLikeJill: I couldn't tell you the key to success, but the key to failure is trying to please everyone.”

["couldn't", 'tell', 'you', 'the', 'key', 'success', 'but', 'the', 'key', 'failure', 'trying', 'please', 'everyone']
 
negative
 
I love when my retainers are super tight and make me talk so attractively! #NOT #sarcasm #majorlisp

['love', 'when', 'retainers', 'are', 'super', 'tight', 'and', 'make', 'talk', 'attractively', 'not', 'sarcasm', 'majorlisp']
 
positive
 
Jersey shore us canceled? #SoSad #Sarcasm

['jersey', 'shore', 'canceled', 'sosad', 'sarcasm']
 
positive
 
I'm incredibly annoyed that I'm gonna have

 
I think #HootSuite is pretty hoot. Er... hot.  http://t.co/afj0hXhw

['think', 'hootsuite', 'pretty', 'hoot', 'hot', 'http//tco/afj0hxhw']
 
positive
 
6 am lift + conditioning #GottaLoveEm #Sarcasm

['lift', 'conditioning', 'gottaloveem', 'sarcasm']
 
positive
 
@sarah_pleat I'm super excited for tomorrow morning. #sarcasm

["i'm", 'super', 'excited', 'for', 'tomorrow', 'morning', 'sarcasm']
 
positive
 
@mchalmers15 Cops would never do anything illegal or wrong though #Sarcasm

['cops', 'would', 'never', 'anything', 'illegal', 'wrong', 'though', 'sarcasm']
 
negative
 
chill night, good laughs.. very much needed. breaking night watching a movie w/ my loves.. Goodnight tweethearts :-*

['chill', 'night', 'good', 'laughs', 'very', 'much', 'needed', 'breaking', 'night', 'watching', 'movie', 'loves', 'goodnight', 'tweethearts']
 
positive
 
Bring it on exams!!! #onmondaypa:) hehe

['bring', 'exams', 'onmondaypa', 'hehe']
 
positive
 
I get distracted way to easily

['get', 'distracted'


['even', 'more', 'breaking', 'shocking', 'news', 'chelsea', 'and', 'barcelona', 'have', 'agreed', 'fee', 'for', 'lionel', 'messi', 'more', 'follow', 'sarcasm', 'itkagent']
 
positive
 
@DesireeShultz exactly! 15 acting like she is...man she's got a lot ahead of her in life! Hah! #sarcasm

['exactly', 'acting', 'like', 'she', 'man', "she's", 'got', 'lot', 'ahead', 'her', 'life', 'hah', 'sarcasm']
 
positive
 
@iTeeRoy ayeeeeeeee jamming to @Power99Philly leggoooo

['ayeee', 'jamming', 'leggooo']
 
positive
 
Super excited to run 17 miles at practice today.... #sarcasm

['super', 'excited', 'run', 'miles', 'practice', 'today', 'sarcasm']
 
negative
 
@EdRunNFootDoc thanks for downloading it!

['thanks', 'for', 'downloading']
 
positive
 
I love that Joey Barton quote “For some reason the Manager dislikes me”. Unbelievable. How can anyone dislike Joey Barton?? #SARCASM

['love', 'that', 'joey', 'barton', 'quote', 'for', 'some', 'reason', 'the', 'manager', 'dislikes', 'unbelievable', 'how

 
positive
 
@BillBaar  Poll could have been  by Senator Dillard. His supporters seem to have the most conflict with the Herald poll results yesterday.

['poll', 'could', 'have', 'been', 'senator', 'dillard', 'his', 'supporters', 'seem', 'have', 'the', 'most', 'conflict', 'with', 'the', 'herald', 'poll', 'results', 'yesterday']
 
negative
 
30k dead, maybe they're targeting civilians #sarcasm @hrw: 10 bakery attacks in #Aleppo are not (cont) http://t.co/H5GdgMy8

['30k', 'dead', 'maybe', "they're", 'targeting', 'civilians', 'sarcasm', 'bakery', 'attacks', 'aleppo', 'are', 'not', 'cont', 'http//tco/h5gdgmy8']
 
negative
 
@Just_kickinit way to have my back doo doo!!!! (:

['way', 'have', 'back', 'doo', 'doo']
 
negative
 
@Kirse10 on the phone "it can work, just look, Lamar and Chloe Kardashian have been together for 3 years now"

['the', 'phone', 'can', 'work', 'just', 'look', 'lamar', 'and', 'chloe', 'kardashian', 'have', 'been', 'together', 'for', 'years', 'now']
 
positive
 
Welp...

 
too many derps taken of me today. fk im beautiful #sarcasm.

['too', 'many', 'derps', 'taken', 'today', 'beautiful', 'sarcasm']
 
positive
 
If I ever need a brain transplant I'd choose yours because I'd want a brain that had never been used #sarcasm http://t.co/Jw1hUMC5

['ever', 'need', 'brain', 'transplant', "i'd", 'choose', 'yours', 'because', "i'd", 'want', 'brain', 'that', 'had', 'never', 'been', 'used', 'sarcasm', 'http//tco/jw1humc5']
 
negative
 
Off to soccer practice, could hell week be any more fun #sarcasm

['off', 'soccer', 'practice', 'could', 'hell', 'week', 'any', 'more', 'fun', 'sarcasm']
 
negative
 
These holidays have got so much better since the start... #Sarcasm -_-

['these', 'holidays', 'have', 'got', 'much', 'better', 'since', 'the', 'start', 'sarcasm']
 
negative
 
Football and hockey are the only two things I'm looking forward to this school year 💙💛

['football', 'and', 'hockey', 'are', 'the', 'only', 'two', 'things', "i'm", 'looking', 'forward', 'this', '

 
@_DanEwing hope you enjoy it!

['hope', 'you', 'enjoy']
 
positive
 
Nothing pisses me off more then fake people. If you don't like someone, don't act like you do #AreYouSerious #LikeReally #Sarcasm

['nothing', 'pisses', 'off', 'more', 'then', 'fake', 'people', 'you', "don't", 'like', 'someone', "don't", 'act', 'like', 'you', 'areyouserious', 'likereally', 'sarcasm']
 
negative
 
@thedevilwolf Dude, where the hell have you been?  He mentions it, like, every time he...oh, wait, #sarcasm.  Never mind.

['dude', 'where', 'the', 'hell', 'have', 'you', 'been', 'mentions', 'like', 'every', 'time', 'wait', 'sarcasm', 'never', 'mind']
 
negative
 
Don't be so formal because I'm much older you think it's not nice to ask me help.. Pls do tell me so I won't be feeling much guilt.

["don't", 'formal', 'because', "i'm", 'much', 'older', 'you', 'think', "it's", 'not', 'nice', 'ask', 'help', 'pls', 'tell', "won't", 'feeling', 'much', 'guilt']
 
negative
 
Tell it to my 9 bitch.

['tell', 'bitch']



["it's", 'extremely', 'difficult', 'hold', 'back', 'the', 'flow', 'feelings', 'now', 'more', 'for', 'pisces', 'http//tco/x620uejg']
 
negative
 
No really. I can sleep quite well while I'm upset. #sarcasm #gunnabealongnight

['really', 'can', 'sleep', 'quite', 'well', 'while', "i'm", 'upset', 'sarcasm', 'gunnabealongnight']
 
negative
 
Made it to my 8am! Professor didn't. #awesome #sarcasm

['made', '8am', 'professor', "didn't", 'awesome', 'sarcasm']
 
positive
 
Unexpected finding shows climate change complexities in soil http://t.co/Gco9cVof

['unexpected', 'finding', 'shows', 'climate', 'change', 'complexities', 'soil', 'http//tco/gco9cvof']
 
negative
 
Could this mean the end of @thecooleyzone 's pottery barn?  Say it ain't so! #sarcasm

['could', 'this', 'mean', 'the', 'end', 'pottery', 'barn', 'say', "ain't", 'sarcasm']
 
negative
 
Jake is pretty picky about his pears... but don't worry, there are a few left for you at Cottin's Hardware... http://t.co/RESgWDnA

['jake', 'pret

In [22]:
# saracasm detector 
# first step select two datasets of each 802 sarcastic and non sarcastic tweets
# our hypothesis is that there are no sarcastics tweets in the "normal" dataset

# store positive and negative tweets
tweet_negative = []
tweet_positive = []

# separate the negative and positive tweets
tweet_negative = tweets_df.loc[tweets_df['Sentiment'] == 0]
tweet_positive = tweets_df.loc[tweets_df['Sentiment'] == 1]

# negative label to saracasm
tweet_negative['Sentiment'].replace(0, 'negative',inplace=True)
tweet_positive['Sentiment'].replace(1, 'negative',inplace=True)

# only keep tweets and their label as a list
df_negative = list(zip(tweet_negative['SentimentText'], tweet_negative['Sentiment']))
df_positive = list(zip(tweet_positive['SentimentText'], tweet_positive['Sentiment']))

# non sarcastic dataframe
non_sarcastic_df = df_negative[:401] + df_positive[:401]

positive = []
for i in range(802):
    positive.append('positive')

# sarcastic dataframe
sarcastic_df = list(zip(saracasm_df.tweets, positive))

#print(sarcastic_df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [30]:
# we create a global dataset with the tokenized tweets
# tokenize the tweets

tweets = []
for (words, sentiment) in sarcastic_df + non_sarcastic_df:
    words = tknzr.tokenize(words)
    words_filtered = []
    for e in words:
        e = e.lower()
        if ':' in e:
            e = e.replace(':', '')
        if '#' in e:
            e = e.replace('#', '')
        if '!' in e:
            e = e.replace('!', '')
        if '?' in e:
            e = e.replace('?', '')
        if ';' in e:
            e = e.replace(';', '')
        if '.' in e:
            e = e.replace('.', '')
        if ',' in e:
            e = e.replace(',', '')
        if '@' in e:
            e = e.replace('@', '')
        if len(e) >= 3:
            words_filtered.append(e)
    tweets.append((words_filtered, sentiment))
        
        
    #words_filtered = [e.lower() for e in words.split() if len(e) >= 3 and ':' not in e and '@' not in e and '!' not in e  and not '?' in e and not '.' in e and not ';' in e and not ',']
    #tweets.append((words_filtered, sentiment))
    
# shuffle the tweets dataset
random.shuffle(tweets)

In [31]:
# training and test set
# we keep 90% for the training set and 10% for the test set
limit_set = int(len(tweets)/10)
test_set = tweets[:limit_set]
training_set = tweets[limit_set:]
print(len(training_set))

1444


In [32]:

# testing the predictor
true_positive = 0
false_positive = 0
true_negative = 0
false_negative = 0
total_tweet = 0
for tweet, sentiment in test_set:
    total_tweet = total_tweet + 1
    count = 0
    words = []
    for word in tweet:
        words.append(word)
        if count == int(len(tweet)/2): 
            first = classifier.classify(extract_features(words))
            words = []
        if count == int(len(tweet)-1):
            second = classifier.classify(extract_features(words))
            words = []
        count = count + 1
    print(tweet)
    print(first != second)
    if first != second:
        if sentiment == 'positive':
            true_positive = true_positive + 1
        else:
            false_positive = false_positive + 1
    else:
        if sentiment == 'positive':
            false_negative = false_negative + 1
        else:
            true_negative = true_negative + 1
    print("sarcasm", sentiment == 'positive')
        
print("accuracy = ", (true_positive + true_negative)/(true_positive + true_negative + false_positive + false_negative))
print("precision = ", true_positive / (true_positive + false_positive))
print("recall = ", true_positive / (true_positive + false_negative))
    
        
        
    
    



#test_set[0]

['that', 'all']
False
sarcasm False
['belhblehbleh', 'the', 'familys', 'watching', 'moviee', 'and', 'yes', 'imm', 'beingg', 'computerr', 'nerd', 'and', 'wish', 'jerkface', 'would', 'wake', 'onee', 'daaayyy']
True
sarcasm False
['can', 'ride', 'bike', 'with', 'hands', 'your', 'impressive', 'sarcasm']
False
sarcasm True
['shocking', 'upset', 'rained', 'today', 'grove', 'city', 'college', 'sarcasm']
True
sarcasm True
['excited']
False
sarcasm False
['bothers', 'that', 'beliebers', 'are', 'saying', 'now', 'only', 'they', 'heard', "jb's", 'coming', 'did', 'not', 'see', 'his', 'tweet', '2months', 'ago', 'realfans', 'sarcasm']
True
sarcasm True
['yay', 'andrew', 'corbin', 'english', 'class', 'sarcasm']
False
sarcasm True
['another', 'early', 'day']
False
sarcasm False
["here's", 'our', 'strictly', 'abdominal', 'bench', 'innovation', 'fitness', 'equipment', 'http//tco/pqwnhh0r']
False
sarcasm True
['منكم', 'أحد', 'نفس', 'منفوسة', 'إلا', 'وقد', 'كتب', 'الله', 'مكانها', 'الجنة', 'والنار', 'وإلا'

sarcasm False
[]
False
sarcasm False
['love', 'you', "i'm", 'sure', 'will', 'sarcasm']
False
sarcasm True
['that', 'all']
False
sarcasm False
['very', 'sad', 'about', 'iran']
False
sarcasm False
['the', 'natalee', 'holloway', 'movie', 'makes', 'wanna', 'cry']
True
sarcasm True
['young', 'teke', 'miss', 'you', 'cuzin', 'has', 'been', 'year', 'sence', 'you', 'died', 'dont', 'ashamed', 'your', 'faith']
False
sarcasm False
['gotta', 'love', 'blipfm', 'john', 'mayer', 'trio', 'california', 'dreamin', 'heard', 'conan', '06/04', '2009', 'http//blipfm/~7qdf0']
False
sarcasm False
['omg', 'most', 'insane', 'matriculants', 'rather', 'play', 'soccer', 'than', 'study', 'prelims', 'soccer4lyf']
True
sarcasm True
['just', 'collected', 'twin', 'sisters', 'pension', 'fuckbitchesgetmoney']
False
sarcasm True
['hate', 'bring', 'cell', 'phone', 'with', 'when', 'the', 'bathroom', 'sarcasm']
True
sarcasm True
["it's", 'fun', 'when', 'only', 'understand', 'the', 'words', 'your', 'tweets', 'sarcasm']
False
s