In [1]:
## imports

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
## importing the data:

df = pd.read_csv("/content/drive/MyDrive/US airline twitter sentiment/Tweets.csv")

In [3]:
df.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [4]:
df.columns

Index(['tweet_id', 'airline_sentiment', 'airline_sentiment_confidence',
       'negativereason', 'negativereason_confidence', 'airline',
       'airline_sentiment_gold', 'name', 'negativereason_gold',
       'retweet_count', 'text', 'tweet_coord', 'tweet_created',
       'tweet_location', 'user_timezone'],
      dtype='object')

In [5]:
## Setting up X and Y:

X = df["text"]
Y = df["airline_sentiment"]

In [6]:
X.shape, Y.shape

((14640,), (14640,))

In [7]:
Y.value_counts()

negative    9178
neutral     3099
positive    2363
Name: airline_sentiment, dtype: int64

In [8]:
X.head()

0                  @VirginAmerica What @dhepburn said.
1    @VirginAmerica plus you've added commercials t...
2    @VirginAmerica I didn't today... Must mean I n...
3    @VirginAmerica it's really aggressive to blast...
4    @VirginAmerica and it's a really big bad thing...
Name: text, dtype: object

In [9]:
pd.set_option("display.max_colwidth",120)
X.head(10)

0                                                                                        @VirginAmerica What @dhepburn said.
1                                                   @VirginAmerica plus you've added commercials to the experience... tacky.
2                                                    @VirginAmerica I didn't today... Must mean I need to take another trip!
3    @VirginAmerica it's really aggressive to blast obnoxious "entertainment" in your guests' faces &amp; they have littl...
4                                                                    @VirginAmerica and it's a really big bad thing about it
5    @VirginAmerica seriously would pay $30 a flight for seats that didn't have this playing.\nit's really the only bad t...
6                                            @VirginAmerica yes, nearly every time I fly VX this “ear worm” won’t go away :)
7               @VirginAmerica Really missed a prime opportunity for Men Without Hats parody, there. https://t.co/mWpG7grEZP


In [10]:
## Classifying tweets without balancing our dataset:

In [11]:
import regex as re

In [12]:
text = "text me at yadav.Deven-007@out.look.com"
re.sub(r'([a-zA-Z0-9._-]+@[a-zA-Z0-9.]+\.[a-zA-Z0-9]+)',"",text)

'text me at '

In [13]:
## removing emails:

X = X.apply(lambda x: re.sub(r'([a-zA-Z0-9._-]+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9]+)',"",x))

In [14]:
## removing urls from the text:

X = X.apply(lambda x: re.sub(r'(http|https|ftp|ssh)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?', '' , x))

In [15]:
X = X.apply(lambda x: re.sub(r'(@[a-zA-Z0-9._-]+)',"",x))

In [16]:
X.head()

0                                                                                                         What  said.
1                                                           plus you've added commercials to the experience... tacky.
2                                                            I didn't today... Must mean I need to take another trip!
3     it's really aggressive to blast obnoxious "entertainment" in your guests' faces &amp; they have little recourse
4                                                                            and it's a really big bad thing about it
Name: text, dtype: object

In [17]:
## removing rt:

X = X.apply(lambda x: re.sub(r'\brt\b', '', x).strip())

In [18]:
X = X.apply(lambda x : str(x).lower().replace("\\"," ").replace("_"," ").replace("."," "))

In [19]:
## removing tagging of airline with @:

X = X.apply(lambda x : re.sub(r'(@+[a-z0-9._-]+)',"", x))

In [20]:
## removing hashtag:

X = X.apply(lambda x : re.sub(r'(#+[a-z0-9+._-]+)',"", x))

In [21]:
## removing accented characters:

import unicodedata

def remove_accented_chars(x):
    x = unicodedata.normalize('NFKD', x).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return x

In [22]:
X = X.apply(lambda x : remove_accented_chars(x))

In [23]:
## expanding words written in short forms:

## this list of words on which expansion would be applied,

contractions = { 
"ain't": "am not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he'll've": "he will have",
"he's": "he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how does",
"i'd": "i would",
"i'd've": "i would have",
"i'll": "i will",
"i'll've": "i will have",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'd've": "it would have",
"it'll": "it will",
"it'll've": "it will have",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she would",
"she'd've": "she would have",
"she'll": "she will",
"she'll've": "she will have",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so is",
"that'd": "that would",
"that'd've": "that would have",
"that's": "that is",
"there'd": "there would",
"there'd've": "there would have",
"there's": "there is",
"they'd": "they would",
"they'd've": "they would have",
"they'll": "they will",
"they'll've": "they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
" u ": " you ",
" ur ": " your ",
" n ": " and ",
"won't": "would not",
'dis': 'this',
'bak': 'back',
'brng': 'bring',
"'ve": " have"}

In [24]:
## function for expansion:

def contraction_expansion(x):
  if type(x) is str:
    for key in contractions:
      value = contractions[key]
      x = x.replace(key,value)
    return x

  else:
    return x

X = X.apply(lambda x : contraction_expansion(x))

In [25]:
## removing special characters:

X = X.apply(lambda x : re.sub('[^A-Za-z]', ' ',x))

In [28]:
X.head()

0                                                                                                         what  said 
1                                                         plus you have added commercials to the experience    tacky 
2                                                           i did not today    must mean i need to take another trip 
3    it is really aggressive to blast obnoxious  entertainment  in your guests  faces  amp  they have little recourse
4                                                                           and it is a really big bad thing about it
Name: text, dtype: object

In [29]:
## removing extra spaces if present:

X = X.apply(lambda x: ' '.join(x.split()))

In [30]:
X.head()

0                                                                                                      what said
1                                                        plus you have added commercials to the experience tacky
2                                                          i did not today must mean i need to take another trip
3    it is really aggressive to blast obnoxious entertainment in your guests faces amp they have little recourse
4                                                                      and it is a really big bad thing about it
Name: text, dtype: object

In [31]:
type(X), type(Y)

(pandas.core.series.Series, pandas.core.series.Series)

In [32]:
import nltk

In [33]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [34]:
from nltk.corpus import stopwords

In [35]:
stop_words = stopwords.words('english')

In [36]:
stop_words[:20]

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his']

In [37]:
X.head()

0                                                                                                      what said
1                                                        plus you have added commercials to the experience tacky
2                                                          i did not today must mean i need to take another trip
3    it is really aggressive to blast obnoxious entertainment in your guests faces amp they have little recourse
4                                                                      and it is a really big bad thing about it
Name: text, dtype: object

In [38]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [39]:
from nltk.tokenize import word_tokenize

In [40]:
X = X.apply(lambda x: word_tokenize(x))

In [41]:
X.head()

0                                                                                                               [what, said]
1                                                          [plus, you, have, added, commercials, to, the, experience, tacky]
2                                                         [i, did, not, today, must, mean, i, need, to, take, another, trip]
3    [it, is, really, aggressive, to, blast, obnoxious, entertainment, in, your, guests, faces, amp, they, have, little, ...
4                                                                       [and, it, is, a, really, big, bad, thing, about, it]
Name: text, dtype: object

In [42]:
X = X.apply(lambda x: [word for word in x if word not in stop_words])

In [43]:
X.head()

0                                                                                         [said]
1                                                  [plus, added, commercials, experience, tacky]
2                                                 [today, must, mean, need, take, another, trip]
3    [really, aggressive, blast, obnoxious, entertainment, guests, faces, amp, little, recourse]
4                                                                      [really, big, bad, thing]
Name: text, dtype: object

In [44]:
Y = Y.map({"negative":0,"neutral":1,"positive":2})

In [45]:
Y.value_counts()

0    9178
1    3099
2    2363
Name: airline_sentiment, dtype: int64

In [46]:
type(X)

pandas.core.series.Series

In [47]:

X[:2]

0                                           [said]
1    [plus, added, commercials, experience, tacky]
Name: text, dtype: object

In [48]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [49]:
from nltk.stem import WordNetLemmatizer
lemmetizer = WordNetLemmatizer()

In [50]:
X = X.apply(lambda x: [lemmetizer.lemmatize(word) for word in x])

In [51]:
X = X.apply(lambda x: ' '.join(x))

In [52]:
X[:5]

0                                                                              said
1                                            plus added commercial experience tacky
2                                            today must mean need take another trip
3    really aggressive blast obnoxious entertainment guest face amp little recourse
4                                                              really big bad thing
Name: text, dtype: object

In [53]:
x = X

In [54]:
x_2 = X

In [55]:
## we need to convert all our data into numbers before we can give to any of our model.
## we will be using tfidf vectorizer for converting our text data into numbers.

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()
X = tfidf.fit_transform(X).toarray()

In [56]:
X[:4]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [57]:
X.shape

(14640, 8789)

In [58]:
## preparing training and test dataset:

from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(X,Y,test_size = 0.2,random_state = 0, stratify=Y, shuffle=True)

In [59]:
## training our data with svm model:

from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix

In [60]:
svm = LinearSVC(class_weight='balanced')
svm.fit(x_train,y_train)

LinearSVC(C=1.0, class_weight='balanced', dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [61]:
y_pred = svm.predict(x_test)

In [62]:
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

           0       0.84      0.86      0.85      1778
           1       0.61      0.55      0.58       689
           2       0.68      0.69      0.69       461

    accuracy                           0.76      2928
   macro avg       0.71      0.70      0.70      2928
weighted avg       0.76      0.76      0.76      2928



In [63]:
from sklearn.linear_model import LogisticRegression

In [64]:
Y.value_counts()

0    9178
1    3099
2    2363
Name: airline_sentiment, dtype: int64

In [65]:
lr = LogisticRegression(class_weight='balanced')

In [66]:
lr.fit(x_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [67]:
y_pred = lr.predict(x_test)

In [68]:
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

           0       0.77      0.89      0.83      1582
           1       0.70      0.51      0.59       845
           2       0.71      0.67      0.69       501

    accuracy                           0.75      2928
   macro avg       0.73      0.69      0.70      2928
weighted avg       0.74      0.75      0.74      2928



In [69]:
x.head()

0                                                                              said
1                                            plus added commercial experience tacky
2                                            today must mean need take another trip
3    really aggressive blast obnoxious entertainment guest face amp little recourse
4                                                              really big bad thing
Name: text, dtype: object

In [70]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [71]:
tokenizer = Tokenizer()

In [72]:
x[:2]

0                                      said
1    plus added commercial experience tacky
Name: text, dtype: object

In [73]:
tokenizer.fit_on_texts(x)

In [74]:
x[:4]

0                                                                              said
1                                            plus added commercial experience tacky
2                                            today must mean need take another trip
3    really aggressive blast obnoxious entertainment guest face amp little recourse
Name: text, dtype: object

In [75]:
vocab_size = len(tokenizer.word_index) + 1
vocab_size

8810

In [76]:
## vocab size gives counts all of words present in the text data:

In [77]:
encoded_text = tokenizer.texts_to_sequences(x)

In [78]:
encoded_text[:2]

[[123], [398, 925, 1051, 104, 4891]]

In [79]:
list_lengths = []
for item in encoded_text:
  length = len(item)
  list_lengths.append(length)
max(list_lengths)


22

In [80]:
max_length = 25

In [81]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [82]:
x = pad_sequences(sequences=encoded_text, maxlen=max_length, padding='post')

In [83]:
x_train,x_test,y_train,y_test = train_test_split(x,Y,random_state = 0,test_size = 0.2, stratify=Y)

In [84]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,Flatten,Dense,GlobalAveragePooling1D,Dropout,Conv1D,MaxPool1D,GlobalMaxPool1D

In [85]:
vec_size = 128

model = Sequential()

model.add(Embedding(input_dim=vocab_size, output_dim=vec_size, input_length=max_length))
model.add(Conv1D(64,2,activation='relu'))
model.add(Dropout(0.7))

model.add(Conv1D(32,2,activation='relu'))
model.add(MaxPool1D(2))
model.add(Dropout(0.6))

model.add(Dense(8,activation='relu'))

model.add(GlobalAveragePooling1D())

model.add(Dense(3,activation='softmax'))


In [86]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 25, 128)           1127680   
_________________________________________________________________
conv1d (Conv1D)              (None, 24, 64)            16448     
_________________________________________________________________
dropout (Dropout)            (None, 24, 64)            0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 23, 32)            4128      
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 11, 32)            0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 11, 32)            0         
_________________________________________________________________
dense (Dense)                (None, 11, 8)             2

In [87]:
from tensorflow.keras.optimizers import Adam

In [88]:
from sklearn.utils import class_weight

In [89]:
class_weights = class_weight.compute_class_weight('balanced', np.unique(y_train), y_train)

In [90]:
class_weights

array([0.53173522, 1.57482856, 2.06451613])

In [91]:
class_weightss = {0:0.53420909, 1:1.57102616, 2: 2.03439291}

In [92]:
from tensorflow.keras.utils import to_categorical

In [93]:
y_train = to_categorical(y_train,3)
y_test = to_categorical(y_test,3)

In [94]:
model.compile(optimizer=Adam(learning_rate=0.001), loss = 'categorical_crossentropy', metrics = ['accuracy'])
model.fit(x_train, y_train, epochs = 5, validation_data=(x_test, y_test), class_weight=class_weightss)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f0ca9740510>

In [95]:
from tensorflow.keras.layers import LSTM

In [96]:
vec_size = 128
model_2 = Sequential()
model_2.add(Embedding(vocab_size,vec_size,input_length=max_length))
model_2.add(LSTM(units=64))
model_2.add(Dense(3,activation="softmax"))

In [97]:
model_2.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 25, 128)           1127680   
_________________________________________________________________
lstm (LSTM)                  (None, 64)                49408     
_________________________________________________________________
dense_2 (Dense)              (None, 3)                 195       
Total params: 1,177,283
Trainable params: 1,177,283
Non-trainable params: 0
_________________________________________________________________


In [98]:
model_2.compile(optimizer=Adam(learning_rate=0.001), loss = 'categorical_crossentropy', metrics = ['accuracy'])
model_2.fit(x_train, y_train, epochs = 5, validation_data=(x_test, y_test), shuffle = True, class_weight=class_weightss)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f0ca95ddd10>