Twitter US Airline Sentiment Analysis

Given Twitter US Airline Sentiment Dataset, which contains data for over 14000 tweets, your task is to predict the sentiment of the tweet i.e. positive, negative or neutral.
You are given:
1. A Training dataset csv file with X train and Y train data
2. A X test File and you have to predict and submit predictions for this file.

Read Instructions carefully -
1. Files are in csv format.
2. Submit a csv file with only predictions for X test data. File should not have any headers and should only have one column i.e. predictions. 
3. Submit your ipynb file as well.
4. Your score is based on number of accurate predictions.

In [1]:
import pandas as pd

train_data=pd.read_csv('_training_twitter_x_y_train.csv')
test_data=pd.read_csv('_test_twitter_x_test.csv')

In [2]:
train_data[:2]

Unnamed: 0,tweet_id,airline_sentiment,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,567900433542488064,negative,Southwest,,ColeyGirouard,,0,"@SouthwestAir I am scheduled for the morning, ...",,2015-02-17 20:16:29 -0800,Washington D.C.,Atlantic Time (Canada)
1,569989168903819264,positive,Southwest,,WalterFaddoul,,0,@SouthwestAir seeing your workers time in and ...,,2015-02-23 14:36:22 -0800,"Indianapolis, Indiana; USA",Central Time (US & Canada)


In [3]:
test_data[:2]

Unnamed: 0,tweet_id,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,569682010270101504,American,,zsalim03,,0,@AmericanAir In car gng to DFW. Pulled over 1h...,,2015-02-22 18:15:50 -0800,Texas,Central Time (US & Canada)
1,569608307184242688,American,,sa_craig,,0,"@AmericanAir after all, the plane didn’t land ...",,2015-02-22 13:22:57 -0800,"College Station, TX",Central Time (US & Canada)


In [4]:
sentiments=train_data['airline_sentiment'].values
training_data=train_data['text'].values
testing_data=test_data['text'].values

In [5]:
categories=list(set(sentiments))
categories

['negative', 'neutral', 'positive']

In [4]:
sentiments[sentiments=='positive']=1
sentiments[sentiments=='neagtive']=-1
sentiments[sentiments=='neutral']=0

In [6]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/shwetakumari/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [7]:
from nltk.corpus import stopwords
import string
stop_words=set(stopwords.words('english'))
punctuations=list(string.punctuation)
stop_words.update(punctuations)

In [8]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/shwetakumari/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [9]:
from nltk.corpus import wordnet
def get_simple_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [10]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/shwetakumari/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [11]:
import nltk
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/shwetakumari/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [12]:
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
lemmatizer=WordNetLemmatizer()

In [13]:
#clean each file
def clean_words(sentences):
    words=[]
    for w in sentences:
        if w.lower() not in stop_words:
            pos=pos_tag([w])
            clean_word=lemmatizer.lemmatize(w,pos=get_simple_pos(pos[0][1]))    
            words.append(clean_word.lower())
    return words    

In [14]:
training_documents=[]
for i in range(len(sentiments)):
    training_documents.append((clean_words(word_tokenize(training_data[i])),sentiments[i]))

In [15]:
training_documents

[(['southwestair',
   'schedule',
   'morning',
   '2',
   'day',
   'fact',
   'yes..not',
   'sure',
   'even',
   'flight',
   'one',
   'cancelled',
   'flightled'],
  'negative'),
 (['southwestair',
   'see',
   'worker',
   'time',
   'time',
   'go',
   'beyond',
   'love',
   'fly',
   'guy',
   'thank'],
  'positive'),
 (['united',
   'flew',
   'ord',
   'miami',
   'back',
   'great',
   'crew',
   'service',
   'leg',
   'thanks'],
  'positive'),
 (['southwestair', 'dultch97', "'s", 'horse', 'radish', '😤🐴'], 'negative'),
 (['united',
   'flight',
   'ord',
   'delayed',
   'air',
   'force',
   'one',
   'last',
   'flight',
   'sbn',
   '8:20',
   '5',
   'min',
   'land'],
  'negative'),
 (['united',
   'load',
   'u',
   'fly',
   'sardine',
   'knew',
   'pilot',
   '2',
   'hour',
   'late',
   'flight',
   'incompetent',
   'beyond',
   'belief'],
  'negative'),
 (['jetblue',
   'stock',
   'response',
   'delays',
   'frustrate',
   'poor',
   'cust',
   'serv',
   '

In [16]:
testing_documents=[]
for i in range(len(testing_data)):
    testing_documents.append((clean_words(word_tokenize(testing_data[i]))))

# PREDICTIONS

In [17]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
train_doc=[" ".join(document) for document,category in training_documents]
categories=[category for document,category in training_documents]
test_doc=[" ".join(document) for document in testing_documents]

In [18]:
from nltk import TweetTokenizer
tokenizer = TweetTokenizer()

In [16]:
count_vec = CountVectorizer(max_features = 10000, max_df=0.25, tokenizer = tokenizer.tokenize, ngram_range=(1, 2))
x_train_cv=count_vec.fit_transform(train_doc).todense()
x_test_cv=count_vec.transform(test_doc).todense()

In [38]:
#SVM
from sklearn.decomposition import PCA
pca_=PCA()
pca_.fit(x_train_cv)

PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False)

In [19]:
pca_.explained_variance_

array([6.28865600e-01, 5.57838136e-01, 2.30021964e-01, ...,
       1.03231877e-33, 4.66161778e-34, 1.78455128e-34])

In [20]:
k = 0
total_var = sum(pca_.explained_variance_)
curr_var = 0
while curr_var / total_var < 0.90:
    k += 1
    curr_var += pca_.explained_variance_[k]
k

3498

In [21]:
#SVM
from sklearn.decomposition import PCA
pca = PCA(n_components = k, whiten = True)
X_train_pca = pca.fit_transform(x_train_cv)
X_test_pca = pca.transform(x_test_cv)
from sklearn.svm import SVC
svc = SVC(C = 1000)
svc.fit(X_train_pca, categories)
Y_pred_svm = svc.predict(X_test_pca)



In [21]:
tf_idf= TfidfVectorizer(max_features = 3000, tokenizer = tokenizer.tokenize, ngram_range=(1, 2))
x_train_cv=tf_idf.fit_transform(train_doc).todense()
x_test_cv=tf_idf.transform(test_doc).todense()

In [22]:
from sklearn.decomposition import PCA
pca = PCA(n_components = 3000, whiten = True)
X_train_pca = pca.fit_transform(x_train_cv)
X_test_pca = pca.transform(x_test_cv)
from sklearn.svm import SVC
svc = SVC(C = 1000)
svc.fit(X_train_pca, categories)
Y_pred_svm = svc.predict(X_test_pca)



In [23]:
Y_pred_svm

array(['negative', 'negative', 'negative', ..., 'negative', 'negative',
       'negative'], dtype='<U8')

In [24]:
ans=pd.DataFrame(Y_pred_svm)
ans

Unnamed: 0,0
0,negative
1,negative
2,negative
3,negative
4,negative
...,...
3655,negative
3656,negative
3657,negative
3658,negative


In [25]:
ans.to_csv("twitter.csv", header=None, index=None)
#63.1 accuracy

In [27]:
categories[categories=='positive']=1
categories[categories=='neagtive']=-1
categories[categories=='neutral']=0

PREDICTION USING RANDOM FOREST CLASSIFIER

In [29]:
from sklearn.ensemble import RandomForestClassifier
clf_r=RandomForestClassifier(n_estimators=23,random_state=0,max_depth=160)
clf_r.fit(X_train_pca,categories)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=160, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=23,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [30]:
y_pred_r=clf_r.predict(X_test_pca)

In [32]:
ans2=pd.DataFrame(y_pred_r)
ans2.to_csv("twitter.csv", header=None, index=None)
#67.4 accuracy

In [3]:
from sklearn.naive_bayes import MultinomialNB as MNB
clf_mnb=MNB(alpha=1.2)
clf_mnb.fit(x_train_cv,categories)
y_pred_mnb=clf_mnb.predict(x_test_cv)
ans3=pd.DataFrame(y_pred_mnb)
ans3.to_csv("twitter.csv", header=None, index=None)
##75% accuracy