In [0]:
import pandas as pd
import numpy as np

In [0]:
data = pd.read_csv("Tweets.csv")

In [191]:
data["airline_sentiment"]

0         neutral
1        positive
2         neutral
3        negative
4        negative
           ...   
14635    positive
14636    negative
14637     neutral
14638    negative
14639     neutral
Name: airline_sentiment, Length: 14640, dtype: object

In [192]:
data["text"]

0                      @VirginAmerica What @dhepburn said.
1        @VirginAmerica plus you've added commercials t...
2        @VirginAmerica I didn't today... Must mean I n...
3        @VirginAmerica it's really aggressive to blast...
4        @VirginAmerica and it's a really big bad thing...
                               ...                        
14635    @AmericanAir thank you we got on a different f...
14636    @AmericanAir leaving over 20 minutes Late Flig...
14637    @AmericanAir Please bring American Airlines to...
14638    @AmericanAir you have my money, you change my ...
14639    @AmericanAir we have 8 ppl so we need 2 know h...
Name: text, Length: 14640, dtype: object

In [0]:
sentiments = list(data["airline_sentiment"])
tweets = list(data["text"])

In [194]:
print(len(sentiments))
print(len(tweets))

14640
14640


**Splitting into train and test data**

In [0]:
from sklearn.model_selection import train_test_split
tweets_train,tweets_test,sentiments_train,sentiments_test = train_test_split(tweets,sentiments,random_state=1)

**Cleaning the text**


In [196]:
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')
nltk.download('stopwords')
import nltk
nltk.download('averaged_perceptron_tagger')
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [0]:
# extended stopwords
from nltk.corpus import stopwords
stop = stopwords.words('english')
import string
punctuations = list(string.punctuation)
stop = stop + punctuations
stop = stop + ['1','2','3','4','5','6','7','8','9','0']

In [0]:
# "not" might be an important word
stop.remove("not")

In [0]:
lemmatizer = WordNetLemmatizer()

In [0]:
def get_lemma_tag(tag):

    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [0]:
def clean_word(word):
  # find the postag of word
  tag = nltk.pos_tag(word.split(" "))[0][1]
  lemma_tag = get_lemma_tag(tag)
  lemmatized_word = lemmatizer.lemmatize(word,pos = lemma_tag)
  return lemmatized_word.lower()

In [0]:
# This function takes the input string and clean it and then return the clean string of text
def clean_tweet(tweet):
  clean_words = []
  words = word_tokenize(tweet)
  for word in words:
    if word.lower() not in stop:
      clean_words.append(clean_word(word))
  # we will have clean words in clean_words for given tweet
  # now converting clean_words to a string and return it
  clean_string = " ".join(clean_words)
  return clean_string

**Cleaning the training data**

In [203]:
clean_tweets_train = []
for current_tweet in tweets_train:
  clean_tweets_train.append(clean_tweet(current_tweet))
clean_tweets_train

['united change make hour something take second online not thrill loved agent though',
 'united engineer design 787 door frame extend half foot plane seat 27a force always sit',
 'americanair 11 11 delayed flight suck get bad',
 'united ouch not fair',
 'usairways stuck cae need reflight booking problems next flight possibly dm info',
 'americanair 8:30 departure sure catering strike jfk foxnews cnbc bloombergradio http //t.co/hpgxyzrw8o',
 'southwestair got help nice lady phone georgia thank',
 'united refund flight voucher another flight different airline expense today hour drive inconvenient',
 'usairways jack_kairys site clearly explains allow carry allow pit not bos make sense',
 'united thanks concern contact customer care upon return australia',
 'usairways well depend policy make determination airline select travel',
 'americanair lady b1 abq 5347 2/23 great service among several cancelled flightlations',
 'americanair considering purchase 21st attempt use 21st believe would fi

In [204]:
len(clean_tweets_train)

10980

**Cleaning the testing data**

In [205]:
clean_tweets_test = []
for current_tweet in tweets_test:
  clean_tweets_test.append(clean_tweet(current_tweet))
clean_tweets_test

["jetblue 'll pas along advice guy rock",
 "united sent dm file reference number.. want know someone locate bag even 's not yet",
 'southwestair black history commercial really sweet well do',
 'southwestair still baltimore delta lap around u laugh ridiculous',
 'southwestair sea den south sound volleyball team way http //t.co/tn5cxcld6m',
 'united one worker refuse give name reference note tone amp language unprofessional',
 'americanair seat assign inappropriate child age aa knew age child',
 "americanair change gate n't tell fuck wrong people learn fucken job",
 'usairways mess cause computer system flight 719 hour late flight gate u est 26 min wait',
 'united come airline 90+ flight last year make check carry-on not even gate check ... baggage claim',
 'usairways pretty ridiculous phx sky harbor employee work check sunday afternoon 30 min amp counting..',
 "united yes 1427 cancelled flightled moved 333 'll figure car rental change",
 "americanair big joke customerservice 've ever s

In [206]:
len(clean_tweets_test)

3660

**Using CountVectorizer to create feature set**

In [0]:
from sklearn.feature_extraction.text import CountVectorizer

In [0]:
count_vectorizer = CountVectorizer(max_features = 9000,ngram_range = (1,2))
x_train = count_vectorizer.fit_transform(clean_tweets_train)
x_test = count_vectorizer.transform(clean_tweets_test)

In [0]:
x_train = x_train.todense()
x_test = x_test.todense()

**Using Multinomial Naive Bayes for classification**

In [0]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(x_train,sentiments_train)
y_pred = clf.predict(x_test)

**Testing Results**

In [211]:
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
print(accuracy_score(sentiments_test,y_pred))
print(classification_report(sentiments_test,y_pred))
print(confusion_matrix(sentiments_test,y_pred))

0.7759562841530054
              precision    recall  f1-score   support

    negative       0.82      0.89      0.86      2291
     neutral       0.61      0.49      0.55       774
    positive       0.75      0.69      0.72       595

    accuracy                           0.78      3660
   macro avg       0.73      0.69      0.71      3660
weighted avg       0.77      0.78      0.77      3660

[[2048  176   67]
 [ 323  381   70]
 [ 120   64  411]]
