# Twitter US Airline Sentiment Analysis

In [1]:
import pandas as pd
import tensorflow as tf
from tensorflow import keras
import numpy as np
import re
import nltk
from sklearn.model_selection import train_test_split
import collections

Download the data.

In [2]:
full_data = pd.read_csv('data/Tweets.csv')
full_data.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [3]:
full_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14640 entries, 0 to 14639
Data columns (total 15 columns):
tweet_id                        14640 non-null int64
airline_sentiment               14640 non-null object
airline_sentiment_confidence    14640 non-null float64
negativereason                  9178 non-null object
negativereason_confidence       10522 non-null float64
airline                         14640 non-null object
airline_sentiment_gold          40 non-null object
name                            14640 non-null object
negativereason_gold             32 non-null object
retweet_count                   14640 non-null int64
text                            14640 non-null object
tweet_coord                     1019 non-null object
tweet_created                   14640 non-null object
tweet_location                  9907 non-null object
user_timezone                   9820 non-null object
dtypes: float64(2), int64(2), object(11)
memory usage: 1.7+ MB


Get the relevant columns for sentiment analysis.

In [4]:
data = full_data[['airline_sentiment', 'text']]
data.head()

Unnamed: 0,airline_sentiment,text
0,neutral,@VirginAmerica What @dhepburn said.
1,positive,@VirginAmerica plus you've added commercials t...
2,neutral,@VirginAmerica I didn't today... Must mean I n...
3,negative,@VirginAmerica it's really aggressive to blast...
4,negative,@VirginAmerica and it's a really big bad thing...


We use remove words that start from with @ as we want to create a model than be applied to other situations as well.

In [5]:
def remove_at_sign(tweet):
    """
    remove words that start with @
    """
    return re.sub(r'@\w+', '', tweet)

In [6]:
data['text'] = data['text'].apply(remove_at_sign)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['text'] = data['text'].apply(remove_at_sign)


In [7]:
def remove_stop_words(tweet):
    """
    remove stop words in the tweet
    """
    stopword = nltk.corpus.stopwords.words('english')
    words = tweet.split()
    relevant_words = [word for word in words if (word not in stopword)]
    return " ".join(relevant_words) 

In [8]:
data['text'] = data['text'].apply(remove_stop_words)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['text'] = data['text'].apply(remove_stop_words)


Now we transform the label data into number instead words.

In [9]:
data['airline_sentiment'] = data['airline_sentiment'].map({
                                                    'neutral': 1, 'positive': 2, 'negative': 0
                                                    })

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['airline_sentiment'] = data['airline_sentiment'].map({


In [10]:
data.head()

Unnamed: 0,airline_sentiment,text
0,1,What said.
1,2,plus added commercials experience... tacky.
2,1,I today... Must mean I need take another trip!
3,0,"really aggressive blast obnoxious ""entertainme..."
4,0,really big bad thing


Split the data into train data, test data, and validation_data

In [11]:
X_full_train, X_test, y_full_train, y_test = train_test_split(data['text'], data['airline_sentiment'],
                                                             test_size=0.1)

In [12]:
X_train, X_valid, y_train, y_valid = train_test_split(X_full_train, y_full_train, test_size = 0.2)

Change words into tokens

In [13]:
tokenizers = keras.preprocessing.text.Tokenizer(lower=True, num_words=10000)
tokenizers.fit_on_texts(X_train)

In [14]:
print('Top 5 most common words are:', collections.Counter(tokenizers.word_counts).most_common(5))

Top 5 most common words are: [('i', 3688), ('flight', 2834), ('get', 970), ('t', 890), ('co', 879)]


In [15]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14640 entries, 0 to 14639
Data columns (total 2 columns):
airline_sentiment    14640 non-null int64
text                 14640 non-null object
dtypes: int64(1), object(1)
memory usage: 228.9+ KB


In [16]:
X_train_seq = tokenizers.texts_to_sequences(X_train)
X_valid_seq = tokenizers.texts_to_sequences(X_valid)
X_test_seq = tokenizers.texts_to_sequences(X_test)

Change the text sequences into One Hot Encoding.

In [17]:
def one_hot_encoding(sequence):
    """
    Create One Hot Encoding based to sequences created by tokenizer
    """
    ohs = np.zeros((len(sequence), 10000))
    for i, word in enumerate(sequence):
        ohs[i, word] = 1
    return ohs

In [18]:
X_train_ohe = one_hot_encoding(X_train_seq)
X_valid_ohe = one_hot_encoding(X_valid_seq)
X_test_seq = one_hot_encoding(X_test_seq)

In [19]:
X_train_ohe.shape

(10540, 10000)

## Create ANN model

In [23]:
model = keras.models.Sequential([
    keras.layers.Dense(64, activation='relu', input_shape=(10000,)),
    keras.layers.Dense(64, activation='relu'),
    keras.layers.Dense(3, activation="softmax")
])

model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

In [30]:
assert X_valid_ohe.shape[0] == y_valid.shape[0]
assert X_train_ohe.shape[0] == y_train.shape[0]

In [31]:
history = model.fit(X_train_ohe, y_train, validation_data = (X_valid_ohe, y_valid), epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [32]:
model.evaluate(X_test_seq, y_test)



[1.5131251811981201, 0.75]