In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [None]:
file_path='/content/drive/MyDrive/TrueFoundry/airline_sentiment_analysis.csv'

In [None]:
import pandas as pd 
class reader(): #class for reading the training data
  def __init__(self):
    self.df=pd.read_csv(file_path)
  def restructure(self):
    res=self.df.drop(['Unnamed: 0'], axis=1)
    columns_titles = ["text","airline_sentiment"]
    res=res.reindex(columns=columns_titles)
    return res

In [None]:
read=reader()
df=read.restructure()
df.head()

Unnamed: 0,text,airline_sentiment
0,@VirginAmerica plus you've added commercials t...,positive
1,@VirginAmerica it's really aggressive to blast...,negative
2,@VirginAmerica and it's a really big bad thing...,negative
3,@VirginAmerica seriously would pay $30 a fligh...,negative
4,"@VirginAmerica yes, nearly every time I fly VX...",positive


In [None]:
df['airline_sentiment'].value_counts()

negative    9178
positive    2363
Name: airline_sentiment, dtype: int64

In [None]:
from sklearn.utils import resample,shuffle
df_1 = df[df['airline_sentiment'] == 'positive']
other_df = df[df['airline_sentiment'] == 'negative']
df_1_upsampled = resample(df_1,n_samples=10000,random_state=40)
df = pd.concat([df_1_upsampled,other_df])
print(df['airline_sentiment'].value_counts())
df.head(3)

positive    10000
negative     9178
Name: airline_sentiment, dtype: int64


Unnamed: 0,text,airline_sentiment
10640,@AmericanAir thanks. They did not charge anyth...,positive
9348,@AmericanAir I was happy to purchase the upgra...,positive
10219,@AmericanAir SFO. Natt (the agent who helped m...,positive


In [None]:
from sklearn import preprocessing
label_encoder = preprocessing.LabelEncoder() #binary label 1: positive, 0: negative
df['airline_sentiment']= label_encoder.fit_transform(df['airline_sentiment'])
df.head(3)

Unnamed: 0,text,airline_sentiment
10640,@AmericanAir thanks. They did not charge anyth...,1
9348,@AmericanAir I was happy to purchase the upgra...,1
10219,@AmericanAir SFO. Natt (the agent who helped m...,1


In [None]:
stopwordlist = ['a', 'about', 'above', 'after', 'again', 'ain', 'all', 'am', 'an',
             'and','any','are', 'as', 'at', 'be', 'because', 'been', 'before',
             'being', 'below', 'between','both', 'by', 'can', 'd', 'did', 'do',
             'does', 'doing', 'down', 'during', 'each','few', 'for', 'from',
             'further', 'had', 'has', 'have', 'having', 'he', 'her', 'here',
             'hers', 'herself', 'him', 'himself', 'his', 'how', 'i', 'if', 'in',
             'into','is', 'it', 'its', 'itself', 'just', 'll', 'm', 'ma',
             'me', 'more', 'most','my', 'myself', 'now', 'o', 'of', 'on', 'once',
             'only', 'or', 'other', 'our', 'ours','ourselves', 'out', 'own', 're','s', 'same', 'she', "shes", 'should', "shouldve",'so', 'some', 'such',
             't', 'than', 'that', "thatll", 'the', 'their', 'theirs', 'them',
             'themselves', 'then', 'there', 'these', 'they', 'this', 'those',
             'through', 'to', 'too','under', 'until', 'up', 've', 'very', 'was',
             'we', 'were', 'what', 'when', 'where','which','while', 'who', 'whom',
             'why', 'will', 'with', 'won', 'y', 'you', "youd","youll", "youre",
             "youve", 'your', 'yours', 'yourself', 'yourselves']

In [None]:
import re
import string
import nltk

def cleaning_mentions(text):
  return re.sub("@[A-Za-z0-9_]+","", text)
df['text'] = df['text'].apply(lambda x: cleaning_mentions(x))

def cleaning_non_alpha(text):
  return re.sub("[^a-z0-9]"," ", text)
df['text'] = df['text'].apply(lambda x: cleaning_non_alpha(x))

STOPWORDS = set(stopwordlist) #cleaning stopwords
def cleaning_stopwords(text):
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])
df['text'] = df['text'].apply(lambda text: cleaning_stopwords(text))

def cleaning_URLs(data): #cleaning URLs (if any)
    return re.sub('((www.[^s]+)|(https?://[^s]+))',' ', data)
df['text'] = df['text'].apply(lambda x: cleaning_URLs(x))

english_punctuations = string.punctuation #removing punctuations
punctuations_list = english_punctuations
def cleaning_punctuations(text):
    translator = str.maketrans('', '', punctuations_list)
    return text.translate(translator)
df['text'] = df['text'].apply(lambda x: cleaning_punctuations(x))

def cleaning_repeating_char(text): #removing repeating characters
    return re.sub(r'(.)1+', r'1', text)
df['text'] = df['text'].apply(lambda x: cleaning_repeating_char(x))

def cleaning_numbers(data): #removing numbers
    return re.sub('[0-9]+', '', data)
df['text'] = df['text'].apply(lambda x: cleaning_numbers(x))

st = nltk.PorterStemmer() #stemming is generally more suitable for sentiment analysis problems
def stemming_on_text(data):
    text = [st.stem(word) for word in data]
    return data
df['text'] = df['text'].apply(lambda x: stemming_on_text(x))

# from nltk.stem import WordNetLemmatizer
# wordnet_lemmatizer = WordNetLemmatizer()
# def lemmatizer(text):
#   lemm_text = [wordnet_lemmatizer.lemmatize(word) for word in text]
#   return lemm_text
# df['text'] = df['text'].apply(lambda x:lemmatizer(x))

df.head(3)

Unnamed: 0,text,airline_sentiment
10640,thanks hey not charge anything end good,1
9348,happy purchase upgrade f avail next flight,1
10219,att agent helped really awesome job,1


In [None]:
df=df.sample(frac=1)
df.head()

Unnamed: 0,text,airline_sentiment
5652,thanks replying feel little better see goes,1
8504,flight ondon sitting tarmac min w update pilo...,0
3027,nd ate light flight today,0
2980,tell staff boarded group end queue old member...,0
4431,outh ound olleyball team way http co c ld,1


In [None]:
#defining & calculating maxlen for pad sequencing
maxlen=0
for i in df.index:
  t=0
  for id in df['text'][i]:
    if(id==' '):
      t=t+1
  maxlen=max(maxlen,t+1)
print(maxlen)

24


In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df.text, df.airline_sentiment, test_size=0.1, random_state=37)
X_train[0]

0    plus added commercials experience tacky
0    plus added commercials experience tacky
Name: text, dtype: object

In [None]:
from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
word_index=tokenizer.word_index
vocab_size = len(word_index)+1
print(vocab_size)

10460


In [None]:
X_learn = tokenizer.texts_to_sequences(X_train)
X_eval = tokenizer.texts_to_sequences(X_test)
len(X_learn)

17260

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
X_train_pad = pad_sequences(X_learn, padding='post', maxlen=maxlen)
X_test_pad = pad_sequences(X_eval, padding='post', maxlen=maxlen)

In [None]:
import numpy as np
embeddings_index = dict() #mapping of various words in GloVe Vocab
f = open('/content/drive/MyDrive/NLP_covid/NLP_covid/glove.6B.200d.txt')
for line in f:
	values = line.split()
	word = values[0]
	coefs = np.asarray(values[1:], dtype='float32')
	embeddings_index[word] = coefs
f.close()
print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 400000 word vectors.


In [None]:
# create a weight matrix for words in training docs
embedding_matrix = np.random.random((vocab_size,200))
for word, i in word_index.items():
	embedding_vector = embeddings_index.get(word)
	if embedding_vector is not None:
		embedding_matrix[i] = embedding_vector

In [None]:
embedding_matrix.shape

(10460, 200)

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, GRU, SpatialDropout1D
# from keras.optimizers import SGD

class classifier(): #model and training class
  def __init__(self,model):
    self.model=model
  def add_layers(self,model):
    self.model.add(Embedding(vocab_size, 200, weights=[embedding_matrix], input_length=maxlen))
    self.model.add(LSTM(units=32,dropout=0.2, recurrent_dropout=0.2))
    self.model.add(Dense(1,activation='sigmoid'))
    return self.model
  def compiler(self,model):
    self.model.compile(loss = 'binary_crossentropy', optimizer='adam', metrics = ['accuracy'])
  def summary(self,model):
    print(self.model.summary())
  def fit(self,model,x,y,epoch, step, batch_size=32, verbose='auto'):
    self.model.fit(x,y,epochs=epoch, steps_per_epoch=step, batch_size=batch_size, verbose=verbose)

In [None]:
model=Sequential()
obj=classifier(model)
model=obj.add_layers(model)
obj.compiler(model)
obj.summary(model)



Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 24, 200)           2092000   
                                                                 
 lstm (LSTM)                 (None, 32)                29824     
                                                                 
 dense (Dense)               (None, 1)                 33        
                                                                 
Total params: 2,121,857
Trainable params: 2,121,857
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
obj.fit(model,X_train_pad, y_train, 10, 100)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
from sklearn.metrics import accuracy_score
y_sol=model.predict(X_test_pad)
y_pred=np.argmax(y_sol,axis=1)
print(accuracy_score(y_test, y_pred))

0.4880083420229406


In [None]:
#Inference class:

class inference():
  def __init__(self,exp):
    self.exp=exp
  def preprocess(self,exp):
    exp=cleaning_mentions(exp)
    exp=cleaning_non_alpha(exp)
    exp=cleaning_numbers(exp)
    exp=cleaning_URLs(exp)
    exp=cleaning_punctuations(exp)
    exp=cleaning_repeating_char(exp)
    exp=cleaning_numbers(exp)
    exp=stemming_on_text(exp)
    exp = tokenizer.texts_to_sequences([exp])
    exp = pad_sequences(exp, padding='post', maxlen=maxlen)
    return exp
  def pred(self,exp):
    pred=model.predict(exp)
    pr=np.argmax(pred,axis=1)
    if(pr==1):
      return ["Positive",pred]
    else:
      return ["Negative",pred]

In [None]:
exp="great!"
obj2=inference(exp)
exp=obj2.preprocess(exp)
print(obj2.pred(exp))

['Negative', array([[0.9872341]], dtype=float32)]
