In [1]:
# Thale Knudsen Kirkhorn and Anne Jacobsen Rike

In [2]:
import re
import nltk
import emoji
import string
import numpy as np
from pandas import read_csv
from nltk.corpus import stopwords
from collections import defaultdict
from sklearn.model_selection import train_test_split 

In [3]:
# Load the tweet dataset and pull out the raw text and its sentiment label
data = read_csv('Tweets.csv')
tweets, labels = data['text'], data['airline_sentiment']

In [4]:
def clean(tweets, stop_words=None):
    """Turn raw tweet strings into lists of lower-cased word tokens.

    Each tweet is lower-cased, stripped of ASCII punctuation and digits,
    split on whitespace, and then filtered for stop words and for tokens
    that start with 'http'.

    Parameters
    ----------
    tweets : pandas.Series of str
        The raw tweet texts.
    stop_words : iterable of str, optional
        Words to drop. When omitted, the NLTK English stop-word list is used,
        extended with a few contraction-less forms.

    Returns
    -------
    pandas.Series of list of str
        One token list per input tweet, with the same index as `tweets`.
    """
    if stop_words is None:
        stop_words = set(stopwords.words('english'))
        # Updating list of stopwords to include some words without contractions
        stop_words.update({'youre', 'youll', 'youd', 'hadnt', 'wouldnt'})
    else:
        stop_words = set(stop_words)
    # NOTE(review): punctuation is stripped before the stop-word filter, so any
    # stop word that contains an apostrophe can never match a token.

    # Built once instead of once per tweet
    strip_punctuation = str.maketrans('', '', string.punctuation)
    digits = re.compile(r"\d+")

    def fix_tweet(tweet):
        tweet = tweet.lower().translate(strip_punctuation)
        tweet = digits.sub('', tweet)
        # With punctuation gone a URL collapses into one token that still starts with 'http'
        return [
            word for word in tweet.split()
            if word not in stop_words and not word.startswith('http')
        ]

    return tweets.apply(fix_tweet)

def split(tweets, labels, test_size=0.25, random_state=42):
    """Split tweets and labels into train and test parts.

    Parameters
    ----------
    tweets, labels
        Parallel collections that are forwarded to `train_test_split`.
    test_size : float, optional
        Forwarded to `train_test_split`; defaults to 0.25.
    random_state : int, optional
        Forwarded to `train_test_split` so the split is reproducible;
        defaults to 42.

    Returns
    -------
    tuple
        (train_tweets, test_tweets, train_labels, test_labels)
    """
    train_tweets, test_tweets, train_labels, test_labels = train_test_split(
        tweets, labels, test_size=test_size, random_state=random_state
    )
    return train_tweets, test_tweets, train_labels, test_labels

def build_vocab(train_tweets):
    """Return the set of distinct words found in the given tokenised tweets."""
    return {token for tokens in train_tweets for token in tokens}

In [5]:
# Clean from the raw column rather than from `tweets` itself, so that this cell
# can be re-run without feeding already-tokenised lists back into clean()
tweets = clean(data['text'])
train_tweets, test_tweets, train_labels, test_labels = split(tweets, labels)
# The vocabulary is built from the training part only
vocab = build_vocab(train_tweets)

In [6]:
# Task 3. One might make use of some of the metadata fields present in the dataset; however, we do not see this 
# as necessary in this assignment. If we were to use the metadata, we could have used the "negativereason" and 
# "name" columns, as these could improve the accuracy by providing context for a tweet. For example, 
# if a person usually writes negative tweets, one can assume that their other tweets tend to be negative as well. 
# As for "negativereason", it could reveal cases where someone writes a seemingly positive, sarcastic tweet 
# that is nevertheless tagged with a reason such as "bad flight". 

In [7]:
class NaiveBayes:
    """Naive Bayes classifier over the classes 'negative', 'neutral' and 'positive'.

    Tweets are given as lists of word tokens. All scores are kept as natural
    logarithms, and word likelihoods use add-one (Laplace) smoothing.
    """

    def __init__(self):
        # Number of training tweets per class and in total
        self.n_negative = None
        self.n_neutral = None
        self.n_positive = None
        self.n_total = None
        # Log prior of each class
        self.negative_prior = None
        self.neutral_prior = None
        self.positive_prior = None
        self.most_negative = None
        # Class names, in the order used for tie-breaking in predict()
        self.sentiment_classes = None
        # Vocabulary the model was trained with
        self.vocab = None
        # counts[sentiment][word]: occurrences of word in tweets of that class
        self.counts = None
        # total_counts[sentiment]: sum of counts[sentiment] over the vocabulary
        self.total_counts = None
        # likelihood[sentiment][word]: smoothed log likelihood
        self.likelihood = None
        # Class scores computed by the most recent predict() call
        self.probs = None
        self.prediction = None

    def _log_prior(self, n_class):
        """Return log(n_class / n_total), or -inf for a class with no tweets."""
        if n_class == 0:
            return -np.inf
        return np.log(n_class / self.n_total)

    def train(self, train_tweets, train_labels, vocab):
        """Estimate log priors and smoothed log likelihoods.

        Parameters
        ----------
        train_tweets : iterable of list of str
            Tokenised training tweets.
        train_labels : iterable of str
            One of 'negative', 'neutral' or 'positive' per tweet.
        vocab : iterable of str
            Words for which likelihoods are stored.

        Raises
        ------
        KeyError
            If a label is not one of the three sentiment classes.
        ValueError
            If there are no training tweets.
        """
        self.sentiment_classes = ['negative', 'neutral', 'positive']
        self.vocab = set(vocab)

        # Count tweets per class and word occurrences per class in one pass.
        # Counting by label name does not depend on how frequent each class is.
        self.counts = {sentiment: defaultdict(int) for sentiment in self.sentiment_classes}
        tweets_per_class = {sentiment: 0 for sentiment in self.sentiment_classes}
        for tweet, label in zip(train_tweets, train_labels):
            tweets_per_class[label] += 1
            for word in tweet:
                self.counts[label][word] += 1

        # Task 4, prior:
        self.n_negative = tweets_per_class['negative']
        self.n_neutral = tweets_per_class['neutral']
        self.n_positive = tweets_per_class['positive']
        self.n_total = self.n_negative + self.n_neutral + self.n_positive
        if self.n_total == 0:
            raise ValueError('Cannot train on an empty training set.')
        self.negative_prior = self._log_prior(self.n_negative)
        self.neutral_prior = self._log_prior(self.n_neutral)
        self.positive_prior = self._log_prior(self.n_positive)

        # Total number of vocabulary word occurrences in each class
        self.total_counts = {
            sentiment: sum(self.counts[sentiment].get(word, 0) for word in self.vocab)
            for sentiment in self.sentiment_classes
        }

        # Calculates the likelihood (Task 8, Laplace smoothing)
        vocab_size = len(self.vocab)
        self.likelihood = {}
        for sentiment in self.sentiment_classes:
            denominator = self.total_counts[sentiment] + vocab_size
            class_counts = self.counts[sentiment]
            self.likelihood[sentiment] = {
                word: np.log((class_counts.get(word, 0) + 1) / denominator)
                for word in self.vocab
            }

    def predict(self, tweet):
        """Return the class with the highest log score for a tokenised tweet.

        Words outside the trained vocabulary are ignored. On a tie the class
        that comes first in `sentiment_classes` is returned. The scores are
        also stored in `self.probs`.
        """
        scores = {
            'negative': self.negative_prior,
            'neutral': self.neutral_prior,
            'positive': self.positive_prior,
        }
        for sentiment in self.sentiment_classes:
            word_log_likelihood = self.likelihood[sentiment]
            for word in tweet:
                # Use the model's own vocabulary, not a notebook-level variable
                if word in word_log_likelihood:
                    scores[sentiment] += word_log_likelihood[word]

        self.probs = scores
        # max() keeps the first maximum, so ties resolve in class order
        return max(self.sentiment_classes, key=scores.__getitem__)

    def explanation_generator(self, tweet):
        """Print the predicted class, then for each known word its most likely class.

        For every word of the tweet that is in the trained vocabulary, the
        class with the highest smoothed log likelihood for that word is
        printed next to it. Unknown words are skipped.
        """
        print('This text was predicted to be', self.predict(tweet))
        for word in tweet:
            if word not in self.vocab:
                continue
            predicted_class = max(
                self.sentiment_classes,
                key=lambda sentiment: self.likelihood[sentiment][word],
            )
            print(word, predicted_class)

    def evaluate(self, test_tweets, test_labels):
        """Print the correct and total counts and return the accuracy.

        Returns 0.0 when there is nothing to evaluate.
        """
        n_correct = 0
        n_total_test = 0
        for tweet, label in zip(test_tweets, test_labels):
            n_total_test += 1
            if self.predict(tweet) == label:
                n_correct += 1

        print(n_correct, n_total_test)
        if n_total_test == 0:
            return 0.0
        return n_correct / n_total_test

    def _print_indices(self, test_tweets, test_labels, want_correct):
        """Print the index of every tweet whose prediction correctness equals want_correct."""
        for (ind, tweet), label in zip(test_tweets.items(), test_labels):
            if (self.predict(tweet) == label) == want_correct:
                print(ind)

    # Task 11, finding correctly and incorrectly predicted tweets
    def find_wrongly_classified(self, test_tweets, test_labels):
        """Print the index of every test tweet that is predicted incorrectly.

        `test_tweets` must provide `.items()` yielding (index, tokens) pairs.
        """
        self._print_indices(test_tweets, test_labels, want_correct=False)

    def find_correctly_classified(self, test_tweets, test_labels):
        """Print the index of every test tweet that is predicted correctly.

        `test_tweets` must provide `.items()` yielding (index, tokens) pairs.
        """
        self._print_indices(test_tweets, test_labels, want_correct=True)
                

In [8]:
#print(data[tweet])

In [9]:
# Fit the classifier on the training split, then report its accuracy on the test split
nb = NaiveBayes()
nb.train(train_tweets, train_labels, vocab)
# evaluate() prints "<correct> <total>" itself and returns the ratio
accuracy = nb.evaluate(test_tweets, test_labels)
print(accuracy)

2857 3660
0.7806010928961749


In [10]:
# Task 9, command line utility

def fix_tweet(tweet):
    """Tokenise one raw text string for prediction.

    The text is lower-cased, stripped of ASCII punctuation and digits, split
    on whitespace, and tokens starting with 'http' are dropped. Unlike the
    helper inside `clean`, stop words are not removed here.

    Parameters
    ----------
    tweet : str
        The raw text.

    Returns
    -------
    list of str
        The remaining word tokens.
    """
    tweet = tweet.lower()
    tweet = tweet.translate(str.maketrans('', '', string.punctuation))
    # Raw string: "\d" in a normal string literal is an invalid escape sequence
    tweet = re.sub(r"\d+", '', tweet)

    return [word for word in tweet.split() if not word.startswith('http')]
    
# Read one line of text from the user and tokenise it with fix_tweet
text = input('Your text: ')
text = fix_tweet(text)

# Classify the tokens with the trained model and show the result
prediction = nb.predict(text)
print(f'Your text is: {prediction}')

Your text: it was a terrible flight
Your text is: negative


In [11]:
# Task 10, explanation generator
# Prints the predicted class for the tweet, then for each word found in the
# vocabulary the class with the maximum likelihood for that word

nb.explanation_generator(test_tweets[6209])

This text was predicted to be positive
southwestair neutral
dont negative
conference positive
number neutral
spoke positive
texas positive
david neutral
finally positive
fixed positive
mine positive
hour negative


In [12]:
# Task 11, correctly and incorrectly predicted tweets
# The indices used below were picked from the output of these two calls.

#nb.find_wrongly_classified(test_tweets, test_labels)
#nb.find_correctly_classified(test_tweets, test_labels)

print(f'Wrongly classified: {test_tweets[2372], test_labels[2372]}')
print('Tweet 2372 predicted as: ' + nb.predict(test_tweets[2372]) + "\n")
# Tweet 2372 is probably predicted as positive because of positive words, such as "luckily", "disinfectant",
# and "welcome".

print(f'Wrongly classified: {test_tweets[6209], test_labels[6209]}')
print('Tweet 6209 predicted as: ' + nb.predict(test_tweets[6209]) + "\n")
# Tweet 6209 is probably predicted as positive because of positive words, such as "finally" and "fixed".

print(f'Correctly classified: {test_tweets[9924], test_labels[9924]}')
print('Tweet 9924 predicted as: ' + nb.predict(test_tweets[9924]) + "\n")
# Tweet 9924 is probably predicted as negative because of negative words, such as "wait", "long", and "away".

print(f'Correctly classified: {test_tweets[4664], test_labels[4664]}')
print('Tweet 4664 predicted as: ' + nb.predict(test_tweets[4664]))
# Tweet 4664 is probably predicted as negative because of negative words, such as "werent",
# "cancelled" and "hours".

Wrongly classified: (['united', 'luckily', 'disinfectant', 'wipes', 'job', 'welcome', 'badservice'], 'negative')
Tweet 2372 predicted as: positive

Wrongly classified: (['southwestair', 'dont', 'conference', 'number', 'spoke', 'courtney', 'texas', 'david', 'finally', 'fixed', 'mine', 'hour'], 'negative')
Tweet 6209 predicted as: positive

Correctly classified: (['usairways', 'going', 'make', 'wait', 'phone', 'long', 'time', 'least', 'turn', 'ads', 'eating', 'away', 'soul'], 'negative')
Tweet 9924 predicted as: negative

Correctly classified: (['southwestair', 'werent', 'sincere', 'cancelled', 'flighted', 'flight', 'made', 'drive', 'hours', 'get', 'home'], 'negative')
Tweet 4664 predicted as: negative
