#### Importing Libraries

In [1]:
import pandas as pd
import numpy as np

import math
import re

#### Reading the CSV file

In [2]:
df = pd.read_csv('Tweets.csv')
df.head()

Unnamed: 0,airline_sentiment,text
0,neutral,@USAirways Is there a phone line to call into ...
1,positive,@united Bag was finally delivered and intact. ...
2,positive,@usairways Thanks to Kevin and team at F38ish ...
3,negative,"@AmericanAir Yes, talked to them. FLL says is ..."
4,negative,@VirginAmerica and it's a really big bad thing...


#### Sorting the values based on the `airline_sentiment`

In [3]:
df.sort_values('airline_sentiment', inplace = True)
df.reset_index(drop = True, inplace = True)
df.head()

Unnamed: 0,airline_sentiment,text
0,negative,@USAirways The fact we did not get notified hi...
1,negative,@united I'd thank you - but you didn't help. ...
2,negative,@USAirways / @AmericanAir don't forget without...
3,negative,@USAirways seriously!!! Flight Cancelled Flig...
4,negative,"@united Why tell us flight is delayed, then te..."


#### Observing the unique type of classes

In [4]:
classes = list(df['airline_sentiment'].unique())
classes

['negative', 'neutral', 'positive']

### Stratified Data Splitting Function
#### Creating a data splitting function which splits in 80 - 20 train-test split

In [5]:
def dataSplit(df):
    train_dfs = []
    test_dfs = []
    index_loc = 0
    class_splits = []
    index = []
    for c in classes:
        tweets = len(df[df['airline_sentiment'] == c])
        class_splits.append(tweets)
        test_tweets = int(0.2*tweets)
        train_tweets = tweets - test_tweets
        
        train_dfs.append(df.iloc[np.arange(index_loc, index_loc + train_tweets)])
        index_loc += train_tweets
        index.append(index_loc)
        
        test_dfs.append(df.iloc[np.arange(index_loc, index_loc + test_tweets)])
        index_loc += test_tweets
        index.append(index_loc)
        
        

    train_df = pd.concat(train_dfs)
    test_df = pd.concat(test_dfs)

    print('Training Data (80% of data): ',len(train_df))
    print('Testing Data (20% of data): ',len(test_df))

    
    return train_df, test_df, index

In [6]:
train_df, test_df, indices = dataSplit(df)
print('The splits are: ',indices)

Training Data (80% of data):  11714
Testing Data (20% of data):  2926
The splits are:  [7343, 9178, 11658, 12277, 14168, 14640]


#### Test & Training Data Splits

In [7]:
print('Negative (Training): ', indices[0] - 0)
display(train_df.loc[0:indices[0]])
print('Negative (Testing): ', indices[1] - indices[0])
display(test_df.loc[indices[0]:indices[1]])

Negative (Training):  7343


Unnamed: 0,airline_sentiment,text
0,negative,@USAirways The fact we did not get notified hi...
1,negative,@united I'd thank you - but you didn't help. ...
2,negative,@USAirways / @AmericanAir don't forget without...
3,negative,@USAirways seriously!!! Flight Cancelled Flig...
4,negative,"@united Why tell us flight is delayed, then te..."
...,...,...
7338,negative,"@united enough already with the poor service, ..."
7339,negative,"@united, any options to cos tonight? Doesn't l..."
7340,negative,@AmericanAir how hard is it to have catering r...
7341,negative,@united tried that already &amp; tried forgett...


Negative (Testing):  1835


Unnamed: 0,airline_sentiment,text
7343,negative,@VirginAmerica why are your first fares in May...
7344,negative,@USAirways the disappointment was not the bag ...
7345,negative,@united - with airport self checkin your only ...
7346,negative,@united the wifi in the ewr lounge reminds me ...
7347,negative,@AmericanAir your assistance has not been very...
...,...,...
9173,negative,@JetBlue didn't find them unfortunately :( but...
9174,negative,@united Contacted yesterday and was told that ...
9175,negative,@JetBlue I tried! Had me running from curb-sid...
9176,negative,.@USAirways and then you said I could move my ...


In [8]:
print('Neutral (Training): ', indices[2] - indices[1])
display(train_df.loc[indices[1]:indices[2]])
print('Neutral (Testing): ', indices[3] - indices[2])
display(test_df.loc[indices[2]:indices[3]])

Neutral (Training):  2480


Unnamed: 0,airline_sentiment,text
9178,neutral,@JetBlue Says #Lufthansa Incentive Offer To Ha...
9179,neutral,@united past
9180,neutral,"@JetBlue 1st flight of the morning, flying wit..."
9181,neutral,@united assume those benefits only apply to my...
9182,neutral,@JetBlue I did get the email. Thought i wasn't...
...,...,...
11653,neutral,@united @estellevw does she need to complain o...
11654,neutral,@united have all 747-400s been updated with po...
11655,neutral,"@jetblue captain ""takes as lot of muscles to f..."
11656,neutral,@SouthwestAir can take u to Midway-Chicago Mar...


Neutral (Testing):  619


Unnamed: 0,airline_sentiment,text
11658,neutral,@united instead of Bourbon street how about #q...
11659,neutral,@AmericanAir Why did AA973 return to JFK? Tha...
11660,neutral,@VirginAmerica @ladygaga @carrieunderwood All ...
11661,neutral,@VirginAmerica Adds Pillows Instead of Lie-Fla...
11662,neutral,@SouthwestAir quick question - i bought wifi f...
...,...,...
12272,neutral,@SouthwestAir I can easily get to the Atlanta ...
12273,neutral,@united we will be at the airport first thing ...
12274,neutral,@SouthwestAir Is there any way to add a Known ...
12275,neutral,@AmericanAir Do you have any flights with lie ...


In [9]:
print('Positive (Training): ', indices[4] - indices[3])
display(train_df.loc[indices[3]:indices[4]])
print('Positive (Testing): ', indices[5] - indices[4])
display(test_df.loc[indices[4]:indices[5]])

Positive (Training):  1891


Unnamed: 0,airline_sentiment,text
12277,positive,@VirginAmerica I love the dancing little richa...
12278,positive,@USAirways Shavon at customer service desk in ...
12279,positive,@SouthwestAir YES please....How do we get that...
12280,positive,Thanks @JetBlue. Next up we will see how the s...
12281,positive,@USAirways Just spoke with a representative. M...
...,...,...
14163,positive,@JetBlue got it. thanks the quick reply.
14164,positive,@AmericanAir @contactcej thanks!
14165,positive,@USAirways We're having 2 grandbabies in 2 wee...
14166,positive,@united this will definitely be a trip to reme...


Positive (Testing):  472


Unnamed: 0,airline_sentiment,text
14168,positive,@SouthwestAir all good now. Going to make it t...
14169,positive,@JetBlue ready to go to Disneyworld! For the ...
14170,positive,@JetBlue @FerrisSalameh Love JetBlue's speedy ...
14171,positive,@SouthwestAir Thank you for the tip!
14172,positive,@USAirways thanks :)
...,...,...
14635,positive,@united Thank you!! 😊
14636,positive,@USAirways Just talked to reservation. Must ...
14637,positive,@SouthwestAir Thanks for the response. Was abl...
14638,positive,@SouthwestAir Thanks to your team for dealing ...


#### Creating a list of stop_words

In [10]:
stop_words = ['i','me','my','myself','we','our','ours','ourselves','you',"you're","you've","you'll","you'd",'your','yours',
'yourself','yourselves','he','him','his','himself','she',"she's",'her','hers','herself','it',"it's",'its','itself','they',
'them','their','theirs','themselves','what','which','who','whom','this','that',"that'll",'these','those','am','is','are','was',
'were','be','been','being','have','has','had','having','do','does','did','doing','a','an','the','and','but','if','or','because',
'as','until','while','of','at','by','for','with','about','against','between','into','through','during','before','after','above',
'below','to','from','up','down','in','out','on','off','over','under','again','further','then','once','here','there','when',
'where','why','how','all','any','both','each','few','more','most','other','some','such','no','nor','not','only','own','same',
'so','than','too','very','s','t','can','will','just','don',"don't",'should',"should've",'now','d','ll','m','o','re','ve','y',
'ain','aren',"aren't",'couldn',"couldn't",'didn',"didn't",'doesn',"doesn't",'hadn',"hadn't",'hasn',"hasn't",'haven',"haven't",
'isn',"isn't",'ma','mightn',"mightn't",'mustn',"mustn't",'needn',"needn't",'shan',"shan't",'shouldn',"shouldn't",'wasn',"wasn't",
'weren',"weren't",'won',"won't",'wouldn',"wouldn't"]

 ### Bag Of Words Implementation

In [11]:
from emoji import UNICODE_EMOJI

def is_emoji(sentence):
    emoji_list = [c for c in sentence if c in UNICODE_EMOJI]
    return emoji_list


In [14]:
def bagOfWords(temp_df, vocabulary, mode = None):
    # Here we add all the words to a general dictionary and see the count of each word
    if mode == 'train':
        
        classDict = {}
        for c in classes:
            classDict[c] = {}

        countDict = {}
        for i in range(len(temp_df)):
            emoji_list = is_emoji(temp_df.iloc[i]['text'])
            words = (re.sub('[^@#A-Za-z\s]+',' ', temp_df.iloc[i]['text'])).lower().split()
            for w in words:
                if '@' in w or '#' in w or len(w) < 4 or len(w) > 15:
                    continue
                elif w in stop_words:
                    continue
                else:
                    vocabulary.add(w)
                    try:
                        countDict[w]  += 1
                    except:
                        countDict[w]  = 1

            [vocabulary.add(c) for c in emoji_list]

            
            
            
            
        # Here we remove the count of all the words with only a single entrance
        countDict = {k: v for k, v in sorted(countDict.items(), key=lambda item: item[1])}
        for i in countDict:
            if countDict[i] == 1:
                vocabulary.discard(i)

                
                
                
    # Here we are preparing the class dictionary and the dataframe that needs to have text in list form            
    for i in range(len(temp_df)):
        text_words = []
        emoji_list = is_emoji(temp_df.iloc[i]['text'])
        words = (re.sub('[^@#A-Za-z\s]+',' ', temp_df.iloc[i]['text'])).lower().split()
        for w in words:
            if '@' in w or '#' in w or len(w) < 4 or len(w) > 15:
                continue
            elif w in stop_words:
                continue
            elif w in vocabulary:
                text_words.append(w)
                if mode == 'train':
                    try:
                        classDict[temp_df.iloc[i]['airline_sentiment']][w] += 1
                    except:
                        classDict[temp_df.iloc[i]['airline_sentiment']][w] = 1
                    
                    for c in classes:
                        try:
                            if classDict[c][w] != 0:
                                continue
                        except:
                            classDict[c][w] = 0
                    
        for e in emoji_list:
            if mode == 'train':
                try:
                    classDict[temp_df.iloc[i]['airline_sentiment']][e] += 1
                except:
                    classDict[temp_df.iloc[i]['airline_sentiment']][e] = 1

                for c in classes:
                    try:
                        if classDict[c][e] != 0:
                            continue
                    except:
                        classDict[c][e] = 0
                
            
        text_words += emoji_list
        temp_df.iloc[i]['text'] = text_words
    if mode == 'train':   
        return temp_df, vocabulary, classDict
    elif mode == 'test':
        return temp_df
    else:
        print('Error: No mode provided.')

In [15]:
vocabulary = set()
tr_df, vocab, class_dict = bagOfWords(train_df, vocabulary, 'train')
print(len(vocab))
tr_df

4256


Unnamed: 0,airline_sentiment,text
0,negative,"[fact, notified, ability, look, alternate, fli..."
1,negative,"[thank, help, taking, hours, reply, message, m..."
2,negative,"[forget, without, customers, would, business]"
3,negative,"[seriously, flight, cancelled, flighted, auto,..."
4,negative,"[tell, flight, delayed, tell, time, spare, del..."
...,...,...
14163,positive,"[thanks, quick, reply]"
14164,positive,[thanks]
14165,positive,"[weeks, travel, thank, reasonable, fares, satu..."
14166,positive,"[definitely, trip, remember, second, season, f..."


#### Add-1 Smoothing

In [16]:
totals = [0,0,0]
for i, c in enumerate(classes):
    for w in class_dict[c]:
        class_dict[c][w] += 1
        totals[i] += class_dict[c][w]
print(classes)
totals

['negative', 'neutral', 'positive']


[60295, 17443, 14995]

#### Preprocessing The Test Data

In [17]:
ts_df = bagOfWords(test_df, vocab, 'test')
ts_df

Unnamed: 0,airline_sentiment,text
7343,negative,"[first, fares, three, times, carriers, seats, ..."
7344,negative,"[disappointment, arrived, kept, without, options]"
7345,negative,"[airport, self, checkin, option, employee, tra..."
7346,negative,"[wifi, lounge, days, dial]"
7347,negative,"[assistance, helpful, frustrating, disappointi..."
...,...,...
14635,positive,"[thank, 😊]"
14636,positive,"[talked, reservation, must, friendly, good, us..."
14637,positive,"[thanks, response, able, situation, resolved, ..."
14638,positive,"[thanks, team, dealing, flight, houston]"


#### Maximum A Prosteriori (HMap)

In [18]:
Prob_Class = [0,0,0]
for i, c in enumerate(classes):
    Prob_Class[i] = len(tr_df[tr_df['airline_sentiment'] == c])/float(len(tr_df))
print(classes)
Prob_Class    

['negative', 'neutral', 'positive']


[0.6268567526037221, 0.21171248079221444, 0.16143076660406352]

In [19]:
Y_Predicted = []
for i in range(len(ts_df)):
    HMap = [0,0,0]
    for j, c in enumerate(classes):
        HMap[j] = math.log(Prob_Class[j])
        for w in ts_df.iloc[i]['text']:
            try:
                WordProb = class_dict[c][w]/float(totals[j])
                HMap[j] += math.log(WordProb)
            except:
                continue
        
    Y_Predicted.append(classes[np.asarray(HMap).argmax()])
    
Y_Predicted

['negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'neutral',
 'negative',
 'negative',
 'negative',
 'neutral',
 'negative',
 'negative',
 'negative',
 'neutral',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 '

#### Creating a Confusion Matrix on the basis of the predicted result

In [20]:
def confusionMatrix(Y_Pred, Y_Test):
    Y_A = pd.Series(Y_Test, name='Actual')
    Y_P = pd.Series(Y_Pred, name='Predicted')
    df_confusion = pd.crosstab(Y_P, Y_A, margins=True)
    display(df_confusion)
    return df_confusion
df_conf = confusionMatrix(Y_Predicted, list(test_df['airline_sentiment']))

Actual,negative,neutral,positive,All
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
negative,1665,310,124,2099
neutral,112,250,56,418
positive,58,59,292,409
All,1835,619,472,2926


### Evaluation Report

In [21]:
def evaluationReport(df_conf):
    print('EVALUATION REPORT \n')
    print('-----------------')
    df_c = df_conf.iloc[:-1,:-1]
    TP, FP, FN, TN, P, R, F, A = [], [], [], [], [], [], [], []
    for i, c in enumerate(classes):
        TP.append(df_c.iloc[i][i])
        FP.append(sum(df_c.iloc[i,:]) - TP[-1])
        FN.append(sum(df_c.iloc[:,i]) - TP[-1])
        TN.append(df_conf.iloc[3,3] - TP[-1] - FP[-1] - FN[-1])
        A.append((TP[-1]+TN[-1])/(df_conf.iloc[3,3]))
        P.append( (TP[-1])/(TP[-1] + FP[-1]) )
        R.append( (TP[-1])/(TP[-1] + FN[-1]) )
        F.append( (2*P[-1]*R[-1])/(P[-1]+R[-1]) )
        print('With Respect To: ', c.capitalize())
        print('\n')
        print('    True Positives : ', TP[-1])
        print('    True Negatives : ', TN[-1])
        print('    False Positives: ', FP[-1])
        print('    False Negatives: ', FN[-1])
        print('    Accuracy       : ', (A[-1]))
        print('    Precision      : ', (P[-1]))
        print('    Recall         : ', (R[-1]))
        print('    F1-Score       : ', (F[-1]))
        print('\n')
    print('-----------------')
    print('Macro Average (Accuracy) : ', sum(A)/float(3))
    print('Macro Average (Precision): ', sum(P)/float(3))
    print('Macro Average (Recall)   : ', sum(R)/float(3))
    print('Macro Average (F1-Score) : ', (sum(F))/float(3))
    print('\n')
    p = (sum(TP))/(sum(TP)+sum(FP))
    r = (sum(TP))/(sum(TP)+sum(FN))
    print('Micro Average (Accuracy) : ', (sum(TP) + sum(TN))/( sum(TP)+ sum(FN) + sum(FP) + sum(TN)))
    print('Micro Average (Precision): ', (sum(TP))/(sum(TP)+sum(FP)))
    print('Micro Average (Recall)   : ', (sum(TP))/(sum(TP)+sum(FN)))
    print('Micro Average (F1-Score) : ', (2*p*r)/(p+r))
    
evaluationReport(df_conf)

EVALUATION REPORT 

-----------------
With Respect To:  Negative


    True Positives :  1665
    True Negatives :  657
    False Positives:  434
    False Negatives:  170
    Accuracy       :  0.7935748462064252
    Precision      :  0.7932348737494045
    Recall         :  0.9073569482288828
    F1-Score       :  0.8464667005592272


With Respect To:  Neutral


    True Positives :  250
    True Negatives :  2139
    False Positives:  168
    False Negatives:  369
    Accuracy       :  0.816473000683527
    Precision      :  0.5980861244019139
    Recall         :  0.40387722132471726
    F1-Score       :  0.4821600771456123


With Respect To:  Positive


    True Positives :  292
    True Negatives :  2337
    False Positives:  117
    False Negatives:  180
    Accuracy       :  0.8984962406015038
    Precision      :  0.7139364303178484
    Recall         :  0.6186440677966102
    F1-Score       :  0.662883087400681


-----------------
Macro Average (Accuracy) :  0.836181362497152


#### _The End_