In [1]:
import pandas as pd
import numpy as np

## Get Data
#### Airline review tweets

In [2]:
data_source_url = "https://raw.githubusercontent.com/kolaveridi/kaggle-Twitter-US-Airline-Sentiment-/master/Tweets.csv"
airline_tweets = pd.read_csv(data_source_url)

airline_tweets.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


## Basic Feature Extraction
### Count Words

In [3]:
airline_tweets['word_count'] = airline_tweets['text'].apply(lambda x: len(str(x).split(" ")))
airline_tweets[['text', 'word_count']].head()

Unnamed: 0,text,word_count
0,@VirginAmerica What @dhepburn said.,4
1,@VirginAmerica plus you've added commercials t...,9
2,@VirginAmerica I didn't today... Must mean I n...,12
3,@VirginAmerica it's really aggressive to blast...,17
4,@VirginAmerica and it's a really big bad thing...,10


### Count Chars

In [4]:
airline_tweets['char_count'] = airline_tweets['text'].str.len()  # Includes spaces
airline_tweets[['text', 'char_count']].head()

Unnamed: 0,text,char_count
0,@VirginAmerica What @dhepburn said.,35
1,@VirginAmerica plus you've added commercials t...,72
2,@VirginAmerica I didn't today... Must mean I n...,71
3,@VirginAmerica it's really aggressive to blast...,126
4,@VirginAmerica and it's a really big bad thing...,55


### Calculate average word length

In [5]:
def avg_word(sentence):
    words = sentence.split()
    return (sum(len(word) for word in words)/len(words))

airline_tweets['avg_word'] = airline_tweets['text'].apply(lambda x: avg_word(x))
airline_tweets[['text', 'avg_word']].head()

Unnamed: 0,text,avg_word
0,@VirginAmerica What @dhepburn said.,8.0
1,@VirginAmerica plus you've added commercials t...,7.111111
2,@VirginAmerica I didn't today... Must mean I n...,5.0
3,@VirginAmerica it's really aggressive to blast...,6.470588
4,@VirginAmerica and it's a really big bad thing...,4.6


### Count stopwords

In [6]:
from nltk.corpus import stopwords
stop = stopwords.words('english')

airline_tweets['stopwords'] = airline_tweets['text'].apply(lambda x: len([x for x in x.split() if x in stop]))
airline_tweets[['text', 'stopwords']].head()

Unnamed: 0,text,stopwords
0,@VirginAmerica What @dhepburn said.,0
1,@VirginAmerica plus you've added commercials t...,3
2,@VirginAmerica I didn't today... Must mean I n...,2
3,@VirginAmerica it's really aggressive to blast...,6
4,@VirginAmerica and it's a really big bad thing...,5


### Count hashtags

In [7]:
airline_tweets['hashtags'] = airline_tweets['text'].apply(lambda x: len([x for x in x.split() if x.startswith('#')]))
airline_tweets[['text', 'hashtags']].head()

Unnamed: 0,text,hashtags
0,@VirginAmerica What @dhepburn said.,0
1,@VirginAmerica plus you've added commercials t...,0
2,@VirginAmerica I didn't today... Must mean I n...,0
3,@VirginAmerica it's really aggressive to blast...,0
4,@VirginAmerica and it's a really big bad thing...,0


### Count numbers

In [8]:
airline_tweets['numerics'] = airline_tweets['text'].apply(lambda x: len([x for x in x.split() if x.isdigit()]))
airline_tweets[['text', 'numerics']].head()

Unnamed: 0,text,numerics
0,@VirginAmerica What @dhepburn said.,0
1,@VirginAmerica plus you've added commercials t...,0
2,@VirginAmerica I didn't today... Must mean I n...,0
3,@VirginAmerica it's really aggressive to blast...,0
4,@VirginAmerica and it's a really big bad thing...,0


### Count ALLCAPS

In [9]:
airline_tweets['upper'] = airline_tweets['text'].apply(lambda x: len([x for x in x.split() if x.isupper()]))
airline_tweets[['text', 'upper']].head()

Unnamed: 0,text,upper
0,@VirginAmerica What @dhepburn said.,0
1,@VirginAmerica plus you've added commercials t...,0
2,@VirginAmerica I didn't today... Must mean I n...,2
3,@VirginAmerica it's really aggressive to blast...,0
4,@VirginAmerica and it's a really big bad thing...,0


## Basic Pre-processing
### Convert to Lower Case

In [10]:
airline_tweets['text'] = airline_tweets['text'].apply(lambda x: " ".join(x.lower() for x in x.split()))
airline_tweets['text'].head()

0                  @virginamerica what @dhepburn said.
1    @virginamerica plus you've added commercials t...
2    @virginamerica i didn't today... must mean i n...
3    @virginamerica it's really aggressive to blast...
4    @virginamerica and it's a really big bad thing...
Name: text, dtype: object

### Remove Punctuation

In [11]:
airline_tweets['text'] = airline_tweets['text'].str.replace('[^\w\s]', '')
airline_tweets['text'].head()

0                     virginamerica what dhepburn said
1    virginamerica plus youve added commercials to ...
2    virginamerica i didnt today must mean i need t...
3    virginamerica its really aggressive to blast o...
4    virginamerica and its a really big bad thing a...
Name: text, dtype: object

### Remove Stopwords

In [12]:
airline_tweets['text'] = airline_tweets['text'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
airline_tweets['text'].head()

0                          virginamerica dhepburn said
1    virginamerica plus youve added commercials exp...
2    virginamerica didnt today must mean need take ...
3    virginamerica really aggressive blast obnoxiou...
4                   virginamerica really big bad thing
Name: text, dtype: object

### Remove Words that Occur Commonly in our Dataset
#### Find ten most frequent words

In [13]:
freq = pd.Series(' '.join(airline_tweets['text']).split()).value_counts()[:10]
freq

united          4143
flight          3873
usairways       3051
americanair     2957
southwestair    2452
jetblue         2361
get             1334
thanks          1072
cancelled       1056
service          956
dtype: int64

#### Remove these words

In [14]:
freq = list(freq.index)
airline_tweets['text'] = airline_tweets['text'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))
airline_tweets['text'].head()

0                          virginamerica dhepburn said
1    virginamerica plus youve added commercials exp...
2    virginamerica didnt today must mean need take ...
3    virginamerica really aggressive blast obnoxiou...
4                   virginamerica really big bad thing
Name: text, dtype: object

### Remove Uncommon Words as Well

In [15]:
freq = pd.Series(' '.join(airline_tweets['text']).split()).value_counts()[-10000:]
freq

1205                  1
pleasecomeback        1
4994                  1
1035                  1
cbssoxfan             1
                     ..
bobbi                 1
nashvilledenver       1
httpstcoyx1dqjn8nl    1
longstanding          1
md80dc10              1
Length: 10000, dtype: int64

In [16]:
freq = list(freq.index)
airline_tweets['text'] = airline_tweets['text'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))
airline_tweets['text'].head()

0                                   virginamerica said
1    virginamerica plus youve added commercials exp...
2    virginamerica didnt today must mean need take ...
3    virginamerica really aggressive blast obnoxiou...
4                   virginamerica really big bad thing
Name: text, dtype: object

### Lemmatize

In [17]:
from textblob import Word

airline_tweets['text'] = airline_tweets['text'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
airline_tweets['text'].head()

0                                   virginamerica said
1    virginamerica plus youve added commercial expe...
2    virginamerica didnt today must mean need take ...
3    virginamerica really aggressive blast obnoxiou...
4                   virginamerica really big bad thing
Name: text, dtype: object


## Sentiment Analysis
### Check first five tweets

In [18]:
from textblob import TextBlob

airline_tweets['text'][:5].apply(lambda x: TextBlob(x).sentiment)

0                                   (0.0, 0.0)
1                                   (0.0, 0.0)
2                            (-0.3125, 0.6875)
3                (0.0062500000000000056, 0.35)
4    (-0.3499999999999999, 0.3833333333333333)
Name: text, dtype: object

(polarity, subjectivity)

### Extract polarity and Compare to Dataset Analysis

In [19]:
airline_tweets['sentiment'] = airline_tweets['text'].apply(lambda x: TextBlob(x).sentiment[0])
airline_tweets[['text', 'sentiment', 'airline_sentiment']].head(20)

Unnamed: 0,text,sentiment,airline_sentiment
0,virginamerica said,0.0,neutral
1,virginamerica plus youve added commercial expe...,0.0,positive
2,virginamerica didnt today must mean need take ...,-0.3125,neutral
3,virginamerica really aggressive blast obnoxiou...,0.00625,negative
4,virginamerica really big bad thing,-0.35,negative
5,virginamerica seriously would pay 30 seat didn...,-0.516667,negative
6,virginamerica yes nearly every time fly vx ear...,0.45,positive
7,virginamerica really missed prime opportunity ...,0.2,neutral
8,virginamerica well,0.0,positive
9,virginamerica amazing arrived hour early youre...,0.466667,positive
