<a href="https://colab.research.google.com/github/victoriaporter58/Airline-Sentiment-Analysis-using-Machine-Learning/blob/main/Data_Cleaning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Cleaning and Preprocessing the Data

In [None]:
import html
import string
from collections import Counter

import nltk
import pandas as pd
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer

In [None]:
# Mount Google Drive inside the Colab runtime; the CSV exports later in this
# notebook write to paths under this mount point.
from google.colab import drive

drive.mount("/content/gdrive")


Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive


In [None]:
# Read the raw tweets from the mounted Drive folder. The path is absolute and
# points at the same folder the to_csv exports use, so this cell does not rely
# on the kernel's current working directory (which a fresh runtime would not
# have set).
df = pd.read_csv('/content/gdrive/My Drive/Data Science TDA Group Project/Task2/Cleaning/Tweets2.csv')
df.shape

(14640, 15)

# Tokenize
This breaks each string up into a list of words.

In [None]:
# Tokeniser that keeps every run of word characters (\w+) as one token;
# anything between those runs (spaces, punctuation, symbols) is discarded.
tokenizer = RegexpTokenizer(pattern=r'\w+')

In [None]:
# Decode HTML entities before tokenising. Without this, entity names are split
# off as ordinary words by the \w+ pattern (the "lt" and "amp" tokens that show
# up in the token lists and word counts), polluting the later frequency tables.
df['text'] = df['text'].apply(lambda x: tokenizer.tokenize(html.unescape(x)))
df['text'].head(20)


0                 [VirginAmerica, What, dhepburn, said]
1     [VirginAmerica, plus, you, ve, added, commerci...
2     [VirginAmerica, I, didn, t, today, Must, mean,...
3     [VirginAmerica, it, s, really, aggressive, to,...
4     [VirginAmerica, and, it, s, a, really, big, ba...
5     [VirginAmerica, seriously, would, pay, 30, a, ...
6     [VirginAmerica, yes, nearly, every, time, I, f...
7     [VirginAmerica, Really, missed, a, prime, oppo...
8     [virginamerica, Well, I, didn, t, but, NOW, I,...
9     [VirginAmerica, it, was, amazing, and, arrived...
10    [VirginAmerica, did, you, know, that, suicide,...
11    [VirginAmerica, I, lt, 3, pretty, graphics, so...
12    [VirginAmerica, This, is, such, a, great, deal...
13    [VirginAmerica, virginmedia, I, m, flying, you...
14                              [VirginAmerica, Thanks]
15    [VirginAmerica, SFO, PDX, schedule, is, still,...
16    [VirginAmerica, So, excited, for, my, first, c...
17    [VirginAmerica, I, flew, from, NYC, to, SF

# Remove Stop Words

These words have low predictive power.

In [None]:
def remove_stopwords(text, stop_words=None):
  """Return the tokens of ``text`` that are not stop words.

  Args:
    text: iterable of string tokens (one tokenised tweet).
    stop_words: optional iterable of words to drop. When omitted, the NLTK
      English stop-word list is used; it is loaded on the first call and
      cached on the function so it is not re-read for every token.

  Returns:
    A new list with the surviving tokens in their original order and
    original casing.

  Matching is case-insensitive, so capitalised stop words such as "I",
  "No" or "What" are removed as well.
  """
  if stop_words is None:
    cached = getattr(remove_stopwords, '_english', None)
    if cached is None:
      # Loaded lazily: the corpus may only be downloaded after this
      # function has been defined.
      cached = frozenset(w.lower() for w in stopwords.words('english'))
      remove_stopwords._english = cached
    stop_set = cached
  else:
    stop_set = {w.lower() for w in stop_words}
  return [w for w in text if w.lower() not in stop_set]


In [None]:
# Fetch the stop-word corpus, then filter every tokenised tweet and preview
# the first ten rows.
nltk.download('stopwords')
df['text'] = df['text'].apply(remove_stopwords)
df['text'].head(10)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


0                [VirginAmerica, What, dhepburn, said]
1    [VirginAmerica, plus, added, commercials, expe...
2    [VirginAmerica, I, today, Must, mean, I, need,...
3    [VirginAmerica, really, aggressive, blast, obn...
4             [VirginAmerica, really, big, bad, thing]
5    [VirginAmerica, seriously, would, pay, 30, fli...
6    [VirginAmerica, yes, nearly, every, time, I, f...
7    [VirginAmerica, Really, missed, prime, opportu...
8              [virginamerica, Well, I, NOW, I, DO, D]
9    [VirginAmerica, amazing, arrived, hour, early,...
Name: text, dtype: object

# Stemming & Lemmatizing

Shorten words back to their root form.

In [None]:
# The WordNet corpus must be present before the lemmatizer is used.
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()


def word_lemmatizer(text):
  """Return a new list holding the lemma of every token in ``text``.

  Each token is passed to ``lemmatizer.lemmatize`` on its own, with no
  part-of-speech argument; the order of the tokens is preserved.
  """
  return list(map(lemmatizer.lemmatize, text))

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [None]:
# Series.apply returns a new Series, so the result has to be assigned back;
# otherwise the lemmatised tokens are only displayed and then thrown away,
# and the stemming step would run on the un-lemmatised text.
df['text'] = df['text'].apply(word_lemmatizer)
df['text']

0                    [VirginAmerica, What, dhepburn, said]
1        [VirginAmerica, plus, added, commercial, exper...
2        [VirginAmerica, I, today, Must, mean, I, need,...
3        [VirginAmerica, really, aggressive, blast, obn...
4                 [VirginAmerica, really, big, bad, thing]
                               ...                        
14635    [AmericanAir, thank, got, different, flight, C...
14636    [AmericanAir, leaving, 20, minute, Late, Fligh...
14637    [AmericanAir, Please, bring, American, Airline...
14638    [AmericanAir, money, change, flight, answer, p...
14639    [AmericanAir, 8, ppl, need, 2, know, many, sea...
Name: text, Length: 14640, dtype: object

In [None]:
# Shared Porter stemmer instance used by word_stemmer.
stemmer = PorterStemmer()

In [None]:
def word_stemmer(text):
  """Stem every token in ``text`` and return them as one string.

  The stems are joined with single spaces, so the result is a plain string
  rather than a list of tokens.
  """
  stems = [stemmer.stem(token) for token in text]
  return " ".join(stems)

In [None]:
# Replace each token list with its stemmed, space-joined string.
df['text'] = df['text'].apply(word_stemmer)

In [None]:
# Display the cleaned text column (now one stemmed string per tweet).
df['text']

0                         virginamerica what dhepburn said
1               virginamerica plu ad commerci experi tacki
2        virginamerica I today must mean I need take an...
3        virginamerica realli aggress blast obnoxi ente...
4                       virginamerica realli big bad thing
                               ...                        
14635          americanair thank got differ flight chicago
14636    americanair leav 20 minut late flight No warn ...
14637    americanair pleas bring american airlin blackb...
14638    americanair money chang flight answer phone an...
14639    americanair 8 ppl need 2 know mani seat next f...
Name: text, Length: 14640, dtype: object

In [None]:
# Save the fully cleaned frame (all rows) to Drive, without the index column.
df.to_csv(
  r'/content/gdrive/My Drive/Data Science TDA Group Project/Task2/Cleaning/Cleaned.csv',
  index=False,
)


# Removing Data With Low Confidence



In [None]:
# Discard tweets whose sentiment label has a confidence below 0.6.
# The remaining rows keep their original index labels (no reset).
low_confidence_rows = df.query('airline_sentiment_confidence < 0.6')
df = df.drop(index=low_confidence_rows.index)

In [None]:
# Save the confidence-filtered frame to Drive, without the index column.
df.to_csv(
  r'/content/gdrive/My Drive/Data Science TDA Group Project/Task2/Cleaning/CleanedSentimentAnalysis.csv',
  index=False,
)

In [None]:
# Row/column count after the low-confidence tweets were dropped.
df.shape

(14402, 15)

# Business Insights

In [None]:
# Count how often each token occurs across all tweets labelled "negative".
# The text is selected by column label instead of by position (it was read as
# column 10), so the cell keeps working if the column order changes.
negative_texts = df.loc[df['airline_sentiment'] == 'negative', 'text']

counter = len(negative_texts)  # number of negative tweets
wordDict = Counter()
for text in negative_texts:
  wordDict.update(text.split())

print(counter)
# most_common() sorts by count, descending; ties keep first-seen order.
print(wordDict.most_common())


9113
[('I', 4315), ('flight', 3598), ('unit', 2876), ('usairway', 2365), ('americanair', 2105), ('southwestair', 1203), ('get', 1191), ('hour', 1100), ('jetblu', 1036), ('cancel', 925), ('delay', 914), ('custom', 781), ('servic', 777), ('time', 758), ('2', 733), ('help', 724), ('call', 701), ('wait', 669), ('bag', 663), ('hold', 653), ('plane', 597), ('amp', 501), ('still', 492), ('tri', 481), ('need', 476), ('flightl', 465), ('day', 451), ('one', 449), ('co', 447), ('http', 444), ('go', 442), ('gate', 424), ('fli', 409), ('phone', 397), ('airlin', 394), ('thank', 385), ('seat', 381), ('late', 378), ('back', 377), ('us', 376), ('would', 375), ('miss', 360), ('agent', 352), ('book', 346), ('3', 344), ('pleas', 328), ('make', 323), ('check', 322), ('chang', 319), ('minut', 315), ('you', 310), ('like', 306), ('4', 302), ('got', 298), ('min', 293), ('hr', 283), ('No', 283), ('never', 283), ('work', 279), ('today', 271), ('take', 270), ('1', 267), ('say', 264), ('even', 260), ('airport', 26

In [None]:
# Count token frequencies across negative tweets that mention customer
# service, i.e. contain both stemmed tokens "custom" and "servic".
# The text is selected by column label instead of by position (column 10).
required_tokens = ('custom', 'servic')
negative_texts = df.loc[df['airline_sentiment'] == 'negative', 'text']

counter = 0  # number of matching tweets
wordDict = Counter()
for text in negative_texts:
  tokens = text.split()
  if all(word in tokens for word in required_tokens):
    counter += 1
    wordDict.update(tokens)

print(counter)
# most_common() sorts by count, descending; ties keep first-seen order.
print(wordDict.most_common())

468
[('custom', 490), ('servic', 487), ('I', 201), ('unit', 146), ('usairway', 144), ('flight', 103), ('americanair', 101), ('southwestair', 71), ('call', 54), ('worst', 52), ('hold', 49), ('hour', 40), ('get', 39), ('help', 36), ('ever', 36), ('phone', 35), ('terribl', 35), ('cancel', 33), ('jetblu', 32), ('wait', 32), ('poor', 30), ('line', 30), ('2', 30), ('airlin', 29), ('agent', 28), ('amp', 28), ('delay', 27), ('tri', 27), ('time', 23), ('today', 23), ('you', 22), ('rep', 22), ('never', 21), ('would', 21), ('still', 20), ('bag', 20), ('thank', 19), ('day', 19), ('No', 18), ('gate', 18), ('your', 18), ('rude', 18), ('min', 17), ('need', 17), ('experi', 17), ('minut', 17), ('guy', 16), ('lost', 16), ('one', 16), ('peopl', 16), ('bad', 16), ('flightl', 16), ('even', 15), ('know', 15), ('last', 15), ('fli', 15), ('http', 15), ('co', 15), ('horribl', 15), ('respons', 14), ('airport', 14), ('suck', 14), ('go', 13), ('speak', 13), ('like', 13), ('hr', 13), ('pleas', 13), ('staff', 13), 

In [None]:
# Count token frequencies across negative customer-service tweets that also
# mention waiting: the tweet must contain both "custom" and "servic", plus at
# least one of the waiting-related tokens below.
# The text is selected by column label instead of by position (column 10).
required_tokens = ('custom', 'servic')
waiting_tokens = ('hour', 'wait', 'hold', 'time', 'minut', 'long', 'call')
negative_texts = df.loc[df['airline_sentiment'] == 'negative', 'text']

counter = 0  # number of matching tweets
wordDict = Counter()
for text in negative_texts:
  tokens = text.split()
  mentions_service = all(word in tokens for word in required_tokens)
  if mentions_service and any(word in tokens for word in waiting_tokens):
    counter += 1
    wordDict.update(tokens)

print(counter)
# most_common() sorts by count, descending; ties keep first-seen order.
print(wordDict.most_common())

149
[('custom', 157), ('servic', 154), ('I', 77), ('call', 54), ('usairway', 50), ('hold', 49), ('hour', 40), ('flight', 37), ('unit', 33), ('wait', 32), ('southwestair', 31), ('americanair', 31), ('time', 23), ('line', 20), ('minut', 17), ('min', 15), ('cancel', 15), ('phone', 14), ('tri', 14), ('2', 14), ('help', 13), ('worst', 12), ('get', 12), ('terribl', 11), ('agent', 11), ('delay', 10), ('thank', 10), ('amp', 10), ('you', 10), ('back', 9), ('hr', 9), ('still', 8), ('suck', 8), ('flightl', 8), ('No', 8), ('bag', 7), ('rep', 7), ('peopl', 7), ('hung', 7), ('30', 7), ('speak', 7), ('1', 7), ('jetblu', 7), ('long', 6), ('not', 6), ('talk', 6), ('book', 6), ('plane', 6), ('today', 6), ('been', 6), ('disconnect', 6), ('got', 6), ('40', 5), ('3', 5), ('your', 5), ('5', 5), ('airlin', 5), ('busi', 5), ('need', 5), ('day', 5), ('rude', 5), ('say', 5), ('storm', 5), ('answer', 5), ('issu', 5), ('6', 5), ('pleas', 5), ('how', 5), ('keep', 5), ('sinc', 5), ('thi', 5), ('w', 5), ('connect', 