Given Twitter US Airline Sentiment Dataset, which contains data for over 14000 tweets, your task is to predict the sentiment of the tweet i.e. positive, negative or neutral.

In [1]:
import pandas as pd
import nltk
import numpy as np
from nltk import word_tokenize 
import re

In [2]:
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Anisha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Anisha\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Anisha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
training_data=pd.read_csv('training_twitter_x_y_train.csv')
training_data["text"][0]

'@SouthwestAir I am scheduled for the morning, 2 days after the fact, yes..not sure why my evening flight was the only one Cancelled Flightled'

In [4]:
testing_data=pd.read_csv("test_twitter_x_test.csv")
testing_data["text"][0]

"@AmericanAir In car gng to DFW. Pulled over 1hr ago - very icy roads. On-hold with AA since 1hr. Can't reach arpt for AA2450. Wat 2 do?"

In [5]:
x=training_data["text"]
type(x)

pandas.core.series.Series

In [6]:
xtest=testing_data["text"]
xtest=np.array(xtest)
xtest=xtest.reshape(len(xtest),1)
xtest.shape

(3660, 1)

In [7]:
y=training_data['airline_sentiment']
x=np.array(x)
y=np.array(y)
x=x.reshape(len(x),1)
y=y.reshape(len(y),1)
x.shape,y.shape

((10980, 1), (10980, 1))

In [8]:
train=np.append(x,y,axis=1)
train.shape

(10980, 2)

In [9]:
test=xtest
test.shape

(3660, 1)

In [10]:
documents = []
c=0
for text,category in train:
    c+=1
    documents.append((word_tokenize(text), category))
documents[0:2]

[(['@',
   'SouthwestAir',
   'I',
   'am',
   'scheduled',
   'for',
   'the',
   'morning',
   ',',
   '2',
   'days',
   'after',
   'the',
   'fact',
   ',',
   'yes..not',
   'sure',
   'why',
   'my',
   'evening',
   'flight',
   'was',
   'the',
   'only',
   'one',
   'Cancelled',
   'Flightled'],
  'negative'),
 (['@',
   'SouthwestAir',
   'seeing',
   'your',
   'workers',
   'time',
   'in',
   'and',
   'time',
   'out',
   'going',
   'above',
   'and',
   'beyond',
   'is',
   'why',
   'I',
   'love',
   'flying',
   'with',
   'you',
   'guys',
   '.',
   'Thank',
   'you',
   '!'],
  'positive')]

In [11]:
test_documents = []
c=0
for text in test:
    c+=1
    test_documents.append(word_tokenize(str(text)))
test_documents[0:2]

[['[',
  '``',
  '@',
  'AmericanAir',
  'In',
  'car',
  'gng',
  'to',
  'DFW',
  '.',
  'Pulled',
  'over',
  '1hr',
  'ago',
  '-',
  'very',
  'icy',
  'roads',
  '.',
  'On-hold',
  'with',
  'AA',
  'since',
  '1hr',
  '.',
  'Ca',
  "n't",
  'reach',
  'arpt',
  'for',
  'AA2450',
  '.',
  'Wat',
  '2',
  'do',
  '?',
  "''",
  ']'],
 ['[',
  "'",
  '@',
  'AmericanAir',
  'after',
  'all',
  ',',
  'the',
  'plane',
  'didn',
  '’',
  't',
  'land',
  'in',
  'identical',
  'or',
  'worse',
  ')',
  'conditions',
  'at',
  'GRK',
  'according',
  'to',
  'METARs',
  '.',
  "'",
  ']']]

In [12]:
import random
random.shuffle(documents)
documents[0:2]

[(['@',
   'USAirways',
   'husband',
   'tried',
   'to',
   'use',
   'dividends',
   '&',
   'amp',
   ';',
   'companion',
   'fair',
   'for',
   'emergency',
   ',',
   'transferred',
   '&',
   'amp',
   ';',
   'put',
   'on',
   'hold',
   'for',
   'over',
   'two',
   'hours',
   'gave',
   'up',
   '#',
   'disappointed'],
  'negative'),
 (['@',
   'united-rebooked',
   'to',
   'OMA',
   '-',
   '180',
   'miles',
   'from',
   'my',
   'destination',
   '.',
   'Spotty',
   'customer',
   'service',
   '.',
   'I',
   'get',
   'staff',
   'stressors',
   'but',
   'come',
   'on',
   ',',
   'this',
   'is',
   'your',
   'business',
   '.'],
  'negative')]

In [13]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [14]:
from nltk.corpus import wordnet
def get_simple_pos(tag):
    
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [15]:
from nltk import pos_tag
w = "better"
pos_tag([w])

[('better', 'RBR')]

In [16]:
from nltk.corpus import stopwords
import string
stops = set(stopwords.words('english'))
punctuations = list(string.punctuation)
stops.update(punctuations)

In [17]:
def clean_review(words):
    output_words = []
    #regex = re.compile('[a-zA-Z0-9_-]+$')
    regex = re.compile('[a-zA-Z_-]+$')
    for w in words:
        if w.lower() not in stops and len(w)>2 and not w.isnumeric() and re.match(regex,w):
            pos = pos_tag([w])
            clean_word = lemmatizer.lemmatize(w, pos = get_simple_pos(pos[0][1]))
            output_words.append(clean_word.lower())
    return output_words

In [18]:
documents = [(clean_review(document), category) for document, category in documents]

In [19]:
test_documents = [clean_review(document) for document in test_documents]

In [20]:
training_documents = documents
testing_documents = test_documents

In [21]:
from sklearn.feature_extraction.text import CountVectorizer

In [22]:
categories = [category for document, category in training_documents]

In [23]:
#Joining All features in each tuple for Train Data
text_documents = [" ".join(document) for document, category in training_documents]

In [24]:
#Joining All features in each tuple for Test Data
test_text_documents = [" ".join(document) for document in testing_documents]

In [25]:
from sklearn.model_selection import train_test_split
#x_train, x_test, y_train, y_test = train_test_split(text_documents, categories)
#count_vec = CountVectorizer(max_features = 2000, ngram_range=(1,2))
count_vec = CountVectorizer(max_features = 7000)

In [26]:
x_train_features = count_vec.fit_transform(text_documents)
x_train_features.todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [27]:
x_train_features.shape

(10980, 7000)

In [28]:
regex = re.compile('[a-zA-Z_-]+$')
re.match(regex,'Coding')

<re.Match object; span=(0, 6), match='Coding'>

In [29]:
count_vec.get_feature_names()

['__rwg__',
 '_austrian',
 '_defcon_',
 '_emmaclifford',
 '_exact_',
 '_justdippin_',
 'a_life_story_',
 'aa',
 'aaaand',
 'aadvantage',
 'aafail',
 'aal',
 'aaron',
 'aarp',
 'abandon',
 'abandonment',
 'abassinet',
 'abbreve',
 'abc',
 'abcnetwork',
 'abcnews',
 'abduct',
 'abi',
 'abigailedge',
 'ability',
 'able',
 'aboard',
 'aboout',
 'abounds',
 'abq',
 'abroad',
 'absolute',
 'absolutely',
 'absorber',
 'absoulutely',
 'absurd',
 'absurdity',
 'absurdly',
 'abt',
 'abundance',
 'abuse',
 'abysmal',
 'acc',
 'accelerate',
 'accept',
 'acceptable',
 'accepted',
 'acces',
 'access',
 'accessibility',
 'accessible',
 'accident',
 'accidentally',
 'accidents',
 'accomidating',
 'accommodate',
 'accommodation',
 'accompaniment',
 'accompany',
 'accomplish',
 'accord',
 'according',
 'accordingly',
 'account',
 'accountability',
 'accrue',
 'acct',
 'accts',
 'accumulation',
 'accurate',
 'accurately',
 'accuratetraveltimes',
 'accuse',
 'achieve',
 'ack',
 'acknowledge',
 'acknowledg

In [30]:
x_test_features = count_vec.transform(test_text_documents)
x_test_features

<3660x7000 sparse matrix of type '<class 'numpy.int64'>'
	with 31492 stored elements in Compressed Sparse Row format>

In [31]:
xtrain=x_train_features
ytrain=categories
xtest=x_test_features

In [32]:
import numpy as np

In [34]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
svc=SVC(kernel="rbf")
grid={'C':[1e2,1e3,5e3,1e4,5e4,1e5],'gamma':[1e-3,5e-4,1e-4,5e-3]}
abc=GridSearchCV(svc,grid)
abc.fit(xtrain, ytrain)



GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'C': [100.0, 1000.0, 5000.0, 10000.0, 50000.0, 100000.0], 'gamma': [0.001, 0.0005, 0.0001, 0.005]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [35]:
abc.best_estimator_

SVC(C=100.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [36]:
abc.cv_results_



{'mean_fit_time': array([ 6.16810036,  6.8836027 ,  7.18197584,  7.2391758 ,  7.72147179,
         6.90481488,  6.02382119,  8.5236783 ,  9.98517799,  9.05411275,
         6.88218331,  9.02098378, 10.95878967, 10.22845769,  7.71940279,
         8.87591879, 13.27506653, 13.87338686, 10.78683956,  9.2880017 ,
        13.90355825, 16.27112873, 14.04606263,  8.76593781]),
 'std_fit_time': array([0.46915358, 0.55478221, 0.7511773 , 0.32724464, 0.31437583,
        0.14244935, 0.04372076, 0.19898889, 0.39900811, 0.22508374,
        0.09202093, 0.15425529, 0.66246323, 0.10041724, 0.21907091,
        0.07071923, 0.71496422, 1.0269496 , 0.09986754, 0.40861914,
        0.61040085, 2.02738104, 0.28707491, 0.18835501]),
 'mean_score_time': array([2.25530195, 2.50315126, 2.94774143, 2.24383664, 2.07871675,
        2.09425004, 2.28678179, 2.08673811, 1.91253948, 1.91331967,
        2.07172163, 1.93278925, 1.84066502, 1.92319671, 1.94424725,
        1.90443484, 1.81393313, 1.80379375, 1.9245611 , 1.93

In [37]:
svc = SVC(C=100,kernel='rbf',gamma=0.001)
svc.fit(xtrain, ytrain)

SVC(C=100, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [38]:
svc=svc.predict(xtest)
svc=np.array(svc)
print(svc)
np.savetxt("twitter_pred_svc_rbf.csv",svc, fmt='%s',encoding=None)

['negative' 'negative' 'negative' ... 'negative' 'positive' 'negative']


In [None]:
"""#MULTINOMIAL NAIVE BAYES SCORE TEST

from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB(alpha=0.83)
clf.fit(xtrain.toarray(), ytrain)"""

In [None]:
"""mnb=clf.predict(xtest)
mnb=np.array(mnb)
mnb"""

In [None]:
"""np.savetxt("twitter_pred_mnb.csv",mnb, fmt='%s',encoding=None)"""