In [95]:
import pandas as pd
from nltk.corpus import twitter_samples
from nltk.corpus import stopwords
from nltk.corpus import twitter_samples

from nltk.tag import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize

from nltk import FreqDist
from nltk import classify
from nltk import NaiveBayesClassifier

import re, string
import random
from emot.emo_unicode import *
import contractions as contractions

In [8]:
positive_tweets = twitter_samples.strings('positive_tweets.json')
negative_tweets = twitter_samples.strings('negative_tweets.json')
text = twitter_samples.strings('tweets.20150430-223406.json')
tweet_tokens = twitter_samples.tokenized('positive_tweets.json')[0]

print(tweet_tokens)

['#FollowFriday', '@France_Inte', '@PKuchly57', '@Milipol_Paris', 'for', 'being', 'top', 'engaged', 'members', 'in', 'my', 'community', 'this', 'week', ':)']


In [13]:
tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
print(pos_tag(tweet_tokens[0]))

[('#FollowFriday', 'JJ'), ('@France_Inte', 'NNP'), ('@PKuchly57', 'NNP'), ('@Milipol_Paris', 'NNP'), ('for', 'IN'), ('being', 'VBG'), ('top', 'JJ'), ('engaged', 'VBN'), ('members', 'NNS'), ('in', 'IN'), ('my', 'PRP$'), ('community', 'NN'), ('this', 'DT'), ('week', 'NN'), (':)', 'NN')]


In [14]:
def lemmatize_sentence(tokens):
    lemmatizer = WordNetLemmatizer()
    lemmatized_sentence = []
    for word, tag in pos_tag(tokens):
        if tag.startswith('NN'):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'
        lemmatized_sentence.append(lemmatizer.lemmatize(word, pos))
    return lemmatized_sentence

print(lemmatize_sentence(tweet_tokens[0]))

['#FollowFriday', '@France_Inte', '@PKuchly57', '@Milipol_Paris', 'for', 'be', 'top', 'engage', 'member', 'in', 'my', 'community', 'this', 'week', ':)']


In [15]:
def remove_noise(tweet_tokens, stop_words = ()):

    cleaned_tokens = []

    for token, tag in pos_tag(tweet_tokens):
        token = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'\
                       '(?:%[0-9a-fA-F][0-9a-fA-F]))+','', token)
        token = re.sub("(@[A-Za-z0-9_]+)","", token)

        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'

        lemmatizer = WordNetLemmatizer()
        token = lemmatizer.lemmatize(token, pos)

        if len(token) > 0 and token not in string.punctuation and token.lower() not in stop_words:
            cleaned_tokens.append(token.lower())
    return cleaned_tokens

In [19]:
stop_words = stopwords.words('english')

In [20]:
remove_noise(tweet_tokens[0], stop_words)

['#followfriday', 'top', 'engage', 'member', 'community', 'week', ':)']

In [21]:
positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')

positive_cleaned_tokens_list = []
negative_cleaned_tokens_list = []

for tokens in positive_tweet_tokens:
    positive_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

for tokens in negative_tweet_tokens:
    negative_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

In [22]:
print(positive_tweet_tokens[500])
print(positive_cleaned_tokens_list[500])

['Dang', 'that', 'is', 'some', 'rad', '@AbzuGame', '#fanart', '!', ':D', 'https://t.co/bI8k8tb9ht']
['dang', 'rad', '#fanart', ':d']


In [32]:
print(negative_tweet_tokens[500])
print(negative_cleaned_tokens_list[500])

['Can', 'u', 'feel', 'it', '?', ':(', '(:', '(', '#exo', 'http://t.co/ghsa262ORm']
['u', 'feel', ':(', '(:', '#exo']


In [92]:
positive_cleaned_tokens_list

[['#followfriday', 'top', 'engage', 'member', 'community', 'week', ':)'],
 ['hey',
  'james',
  'odd',
  ':/',
  'please',
  'call',
  'contact',
  'centre',
  '02392441234',
  'able',
  'assist',
  ':)',
  'many',
  'thanks'],
 ['listen', 'last', 'night', ':)', 'bleed', 'amazing', 'track', 'scotland'],
 ['congrats', ':)'],
 ['yeaaaah',
  'yippppy',
  'accnt',
  'verify',
  'rqst',
  'succeed',
  'get',
  'blue',
  'tick',
  'mark',
  'fb',
  'profile',
  ':)',
  '15',
  'day'],
 ['one', 'irresistible', ':)', '#flipkartfashionfriday'],
 ['like',
  'keep',
  'lovely',
  'customer',
  'wait',
  'long',
  'hope',
  'enjoy',
  'happy',
  'friday',
  'lwwf',
  ':)'],
 ['second',
  'thought',
  '’',
  'enough',
  'time',
  'dd',
  ':)',
  'new',
  'short',
  'enter',
  'system',
  'sheep',
  'must',
  'buy'],
 ['jgh', 'go', 'bayan', ':d', 'bye'],
 ['act',
  'mischievousness',
  'call',
  'etl',
  'layer',
  'in-house',
  'warehouse',
  'app',
  'katamari',
  'well',
  '…',
  'name',
  'imply

In [33]:
def get_all_words(cleaned_tokens_list):
    for tokens in cleaned_tokens_list:
        for token in tokens:
            yield token

all_pos_words = get_all_words(positive_cleaned_tokens_list)

In [34]:
freq_dist_pos = FreqDist(all_pos_words)
print(freq_dist_pos.most_common(10))

[(':)', 3691), (':-)', 701), (':d', 658), ('thanks', 388), ('follow', 357), ('love', 333), ('...', 290), ('good', 283), ('get', 263), ('thank', 253)]


In [35]:
def get_tweets_for_model(cleaned_tokens_list):
    for tweet_tokens in cleaned_tokens_list:
        yield dict([token, True] for token in tweet_tokens)

positive_tokens_for_model = get_tweets_for_model(positive_cleaned_tokens_list)
negative_tokens_for_model = get_tweets_for_model(negative_cleaned_tokens_list)

In [36]:
positive_dataset = [(tweet_dict, "Positive")
                     for tweet_dict in positive_tokens_for_model]

negative_dataset = [(tweet_dict, "Negative")
                     for tweet_dict in negative_tokens_for_model]

dataset = positive_dataset + negative_dataset

random.shuffle(dataset)

train_data = dataset[:7000]
test_data = dataset[7000:]

print(positive_dataset)
print(negative_dataset)

[({'#followfriday': True, 'top': True, 'engage': True, 'member': True, 'community': True, 'week': True, ':)': True}, 'Positive'), ({'hey': True, 'james': True, 'odd': True, ':/': True, 'please': True, 'call': True, 'contact': True, 'centre': True, '02392441234': True, 'able': True, 'assist': True, ':)': True, 'many': True, 'thanks': True}, 'Positive'), ({'listen': True, 'last': True, 'night': True, ':)': True, 'bleed': True, 'amazing': True, 'track': True, 'scotland': True}, 'Positive'), ({'congrats': True, ':)': True}, 'Positive'), ({'yeaaaah': True, 'yippppy': True, 'accnt': True, 'verify': True, 'rqst': True, 'succeed': True, 'get': True, 'blue': True, 'tick': True, 'mark': True, 'fb': True, 'profile': True, ':)': True, '15': True, 'day': True}, 'Positive'), ({'one': True, 'irresistible': True, ':)': True, '#flipkartfashionfriday': True}, 'Positive'), ({'like': True, 'keep': True, 'lovely': True, 'customer': True, 'wait': True, 'long': True, 'hope': True, 'enjoy': True, 'happy': Tru

In [37]:
classifier = NaiveBayesClassifier.train(train_data)

In [38]:
print("Accuracy is:", classify.accuracy(classifier, test_data))

print(classifier.show_most_informative_features(10))

Accuracy is: 0.997
Most Informative Features
                      :) = True           Positi : Negati =   1677.3 : 1.0
                follower = True           Positi : Negati =     38.3 : 1.0
                     sad = True           Negati : Positi =     32.5 : 1.0
                     bam = True           Positi : Negati =     22.0 : 1.0
                    glad = True           Positi : Negati =     17.9 : 1.0
                  arrive = True           Positi : Negati =     17.7 : 1.0
                     x15 = True           Negati : Positi =     17.4 : 1.0
                 welcome = True           Positi : Negati =     14.9 : 1.0
                followed = True           Negati : Positi =     14.0 : 1.0
              appreciate = True           Positi : Negati =     13.9 : 1.0
None


In [135]:
filtered_tweets = pd.read_csv('Tweepy/snscrape.csv',lineterminator='\n')
filtered_tweets = pd.DataFrame(filtered_tweets)
filtered_tweets = filtered_tweets[['Text']]

In [136]:
filtered_tweets

Unnamed: 0,Text
0,@R_H_D__ @DanielTheHuman1 @StrewthQueen @Willi...
1,@idsharman Indeed. I'll let us take a teensy f...
2,@klucy239 @Mc3Mc336241251 @GOP actually its no...
3,#SINGAPORE: Government considering repeal of s...
4,"""On A Clear Day You Can See Forever"" #musical ..."
...,...
19995,the movie has lit rally like 0.0001% lgbt cont...
19996,@Julia20526177 @AubryAndrews LGBT allies
19997,As long as lgbt are hygenic and clean and not ...
19998,GOP-controlled states sue over LGBT-inclusive ...


In [137]:
cleaned_tweets = filtered_tweets['Text']
cleaned_tweets[0]

"@R_H_D__ @DanielTheHuman1 @StrewthQueen @WillisaOsburn It really isn't, but good to know you'll believe any homophobic thing someone on the internet has tweeted with no founding.\nIf it were actually LGBT people supporting it our government would actually do something about it.\n\nBut it's old men with money, so they get a pass."

In [138]:
texts= filtered_tweets['Text'].values.tolist()
texts

["@R_H_D__ @DanielTheHuman1 @StrewthQueen @WillisaOsburn It really isn't, but good to know you'll believe any homophobic thing someone on the internet has tweeted with no founding.\nIf it were actually LGBT people supporting it our government would actually do something about it.\n\nBut it's old men with money, so they get a pass.",
 "@idsharman Indeed. I'll let us take a teensy fraction of responsibility for some parts. Like being some people's first positive imagery of the LGBT community*, the mutual aid-ish parts**, and maybe a /touch/ of relationship drama.",
 '@klucy239 @Mc3Mc336241251 @GOP actually its not, not only have child drag shows been happening despite pushback with support from the Dems and leftists but they have also taken to defending people grooming kids. Saying that calling out groomers is anti-LGBT. Somehow they got it in their minds that those are =',
 '#SINGAPORE: Government considering repeal of sodomy law along with unspecified measures to ensure ban on #LGBT ma

In [139]:
def remove_contraction(text):
    # expanded_words = []

    # for word in text.split():
    #   # using contractions.fix to expand the shortened words
    # lower_text = text.lower()
    removed_contraction = contractions.fix(text)
    return removed_contraction

tweets_text_cleaned= [remove_contraction(text) for text in texts]
print(tweets_text_cleaned[:10])

['@R_H_D__ @DanielTheHuman1 @StrewthQueen @WillisaOsburn It really is not, but good to know you will believe any homophobic thing someone on the internet has tweeted with no founding.\nIf it were actually LGBT people supporting it our government would actually do something about it.\n\nBut it is old men with money, so they get a pass.', "@idsharman Indeed. I will let us take a teensy fraction of responsibility for some parts. Like being some people's first positive imagery of the LGBT community*, the mutual aid-ish parts**, and maybe a /touch/ of relationship drama.", '@klucy239 @Mc3Mc336241251 @GOP actually its not, not only have child drag shows been happening despite pushback with support from the Dems and leftists but they have also taken to defending people grooming kids. Saying that calling out groomers is anti-LGBT. Somehow they got it in their minds that those are =', '#SINGAPORE: Government considering repeal of sodomy law along with unspecified measures to ensure ban on #LGBT

In [140]:
def clean_tweet(tweet):
    '''
    Function to clean a tweet text by removing the links, punctuations, hashtags, and mentions symbols
    using regex expressions.
    Parameters:
       tweet (string): Tweet text to transform.
    Returns:
       tweet_cleaned (string): Tweet without links, punctuations, hashtags, and mentions symbols.
    """
    '''
    # convert the tweet text to lowercase letters
    tweet_lower = tweet.lower()

    #remove mentions
    remove_mentions = re.sub("@[A-Za-z0-9_]+","", tweet_lower)

    # remove links, punctuations, hashtags, and etc
    text = re.sub(u"(http\S+)|([“\"”])|([#|_*@()\`])", "", remove_mentions)

    # remove punctations
    text = re.sub(r" & ", " and ", text)
    text = re.sub(r"&", " ", text)
    text = re.sub(r"[,!?.\/]", " ", text)
    text = re.sub(r"=", " equal ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"\?", " ? ", text)
    text = re.sub(r"\+", " plus ", text)
    text = re.sub(r"\n", "", text)
    text = re.sub(r":", " ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r";", " ", text)

    return text

tweets_text_cleaned = [clean_tweet(text) for text in tweets_text_cleaned]
print(tweets_text_cleaned[:10])

['    it really is not  but good to know you will believe any homophobic thing someone on the internet has tweeted with no founding if it were actually lgbt people supporting it our government would actually do something about it but it is old men with money  so they get a pass ', ' indeed  i will let us take a teensy fraction of responsibility for some parts  like being some people s first positive imagery of the lgbt community  the mutual aid-ish parts  and maybe a  touch  of relationship drama ', '   actually its not  not only have child drag shows been happening despite pushback with support from the dems and leftists but they have also taken to defending people grooming kids  saying that calling out groomers is anti-lgbt  somehow they got it in their minds that those are  equal ', 'singapore  government considering repeal of sodomy law along with unspecified measures to ensure ban on lgbt marriage is protected from court challenge equalmarriage ', 'on a clear day you can see forev

In [141]:
# Function for converting emojis into word
def convert_emojis(text):
    for emot in UNICODE_EMOJI:
        text = text.replace(emot, "_".join(UNICODE_EMOJI[emot].replace(",","").replace('$', '').replace(":","").split()))
    return text

tweets_text_cleaned = [convert_emojis(tweet) for tweet in tweets_text_cleaned]
print(tweets_text_cleaned[:10])

['    it really is not  but good to know you will believe any homophobic thing someone on the internet has tweeted with no founding if it were actually lgbt people supporting it our government would actually do something about it but it is old men with money  so they get a pass ', ' indeed  i will let us take a teensy fraction of responsibility for some parts  like being some people s first positive imagery of the lgbt community  the mutual aid-ish parts  and maybe a  touch  of relationship drama ', '   actually its not  not only have child drag shows been happening despite pushback with support from the dems and leftists but they have also taken to defending people grooming kids  saying that calling out groomers is anti-lgbt  somehow they got it in their minds that those are  equal ', 'singapore  government considering repeal of sodomy law along with unspecified measures to ensure ban on lgbt marriage is protected from court challenge equalmarriage ', 'on a clear day you can see forev

In [142]:
filtered_tweets = pd.DataFrame({'Cleaned_Text': tweets_text_cleaned})
filtered_tweets

Unnamed: 0,Cleaned_Text
0,it really is not but good to know you wil...
1,indeed i will let us take a teensy fraction ...
2,actually its not not only have child drag ...
3,singapore government considering repeal of so...
4,on a clear day you can see forever musical ope...
...,...
19995,the movie has lit rally like 0 0001% lgbt cont...
19996,lgbt allies
19997,as long as lgbt are hygenic and clean and not ...
19998,gop-controlled states sue over lgbt-inclusive ...


In [143]:
twt_tokens = [remove_noise(word_tokenize(twt)) for twt in cleaned_tweets]

In [144]:
print(classifier.classify(dict([token, True] for token in twt_tokens[0])))

Negative


In [145]:
def classify_tweets(twt_tokens):
    sentiments = []
    for twt_token in twt_tokens:
        s = classifier.classify(dict([token, True] for token in twt_token))
        sentiments.append(s)

    filtered_tweets['sentiment'] = sentiments
        
classify_tweets(twt_tokens)
filtered_tweets

Unnamed: 0,Cleaned_Text,sentiment
0,it really is not but good to know you wil...,Negative
1,indeed i will let us take a teensy fraction ...,Positive
2,actually its not not only have child drag ...,Negative
3,singapore government considering repeal of so...,Positive
4,on a clear day you can see forever musical ope...,Positive
...,...,...
19995,the movie has lit rally like 0 0001% lgbt cont...,Negative
19996,lgbt allies,Negative
19997,as long as lgbt are hygenic and clean and not ...,Negative
19998,gop-controlled states sue over lgbt-inclusive ...,Negative


In [147]:
filtered_tweets.head(20)

Unnamed: 0,Cleaned_Text,sentiment
0,it really is not but good to know you wil...,Negative
1,indeed i will let us take a teensy fraction ...,Positive
2,actually its not not only have child drag ...,Negative
3,singapore government considering repeal of so...,Positive
4,on a clear day you can see forever musical ope...,Positive
5,ikr discovering autistic twitter and especia...,Positive
6,indiana jones voice snakes why is it always s...,Positive
7,do you think there are more people pushing th...,Positive
8,appao lgbt transgender_symbol my one of my fav...,Positive
9,i am lgbt for stan marsh canon,Negative


In [148]:
# filtered_tweets.rename({'english-Only-Cleaned_Text\r': 'English_Only_Cleaned_Text'}, axis=1, inplace=True)
# filtered_tweets

In [149]:
filtered_tweets.to_csv('tweets_nltk_20k.csv')