In [64]:
import numpy as np
import pandas as pd
import re

In [65]:
# Read the raw tweet table from the CSV next to the notebook and preview
# its first five rows.
dataset = pd.read_csv(filepath_or_buffer='twitter_data.csv')
dataset.head(n=5)

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [66]:
# Keep only the label and the tweet body. The explicit .copy() gives an
# independent frame, so the column assignments below do not write into a
# selection of the originally loaded frame (avoids SettingWithCopyWarning).
dataset = dataset[['airline_sentiment', 'text']].copy()

# Strip the airline account names from the tweets. Matching is
# case-sensitive and literal, and only the name is removed, so the leading
# '@' stays in the text. The names are applied in this fixed order.
for handle in ('VirginAmerica', 'AmericanAir', 'SouthwestAir', 'USAirways', 'JetBlue'):
  dataset['text'] = dataset['text'].str.replace(handle, '', regex=False)
dataset.head(2)

Unnamed: 0,airline_sentiment,text
0,neutral,@ What @dhepburn said.
1,positive,@ plus you've added commercials to the experie...


In [67]:
from sklearn.preprocessing import LabelEncoder

# Replace the string sentiment labels with integer codes. The next cell's
# output shows neutral -> 1 and positive -> 2; negative is presumably 0
# (LabelEncoder orders classes by sorted label) -- confirm via encoder.classes_.
encoder = LabelEncoder()
dataset['airline_sentiment'] = encoder.fit_transform(y=dataset['airline_sentiment'])

In [68]:
# Show the first two encoded labels to sanity-check the encoding step.
dataset['airline_sentiment'].head(2)

0    1
1    2
Name: airline_sentiment, dtype: int32

In [69]:
# Count missing values per column.
dataset.isna().sum()

airline_sentiment    0
text                 0
dtype: int64

In [70]:
# Flag the tweets whose text contains at least one ASCII letter; values are
# passed through str() first so non-string entries can still be searched.
has_letters = dataset['text'].apply(
    lambda tweet: re.search('[a-zA-Z]', str(tweet)) is not None
)

# Keep only the flagged rows and renumber them 0..n-1.
dataset = dataset[has_letters].reset_index(drop=True)

In [71]:
# (rows, columns) remaining after the filtering step.
dataset.shape

(14640, 2)

In [72]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Build the stemmer and the stop-word collection once, outside the loop.
# They do not change between tweets, and rebuilding the set for every word
# was pure overhead.
ps = PorterStemmer()
all_stopwords = stopwords.words('english')
# Take 'not' out of the stop-word list so it is kept in the cleaned text.
all_stopwords.remove('not')
stopword_set = set(all_stopwords)

corpus = []
# Iterate over the column itself instead of a hard-coded row count, so the
# loop stays correct if an earlier filtering step changes the number of rows.
for text in dataset['text']:
  # Replace every character that is not an ASCII letter with a space.
  review = re.sub('[^a-zA-Z]', ' ', text)
  review = review.lower().split()
  # Drop stop words, then stem what is left.
  review = [ps.stem(word) for word in review if word not in stopword_set]
  review = ' '.join(review)
  corpus.append(review)

[nltk_data] Downloading package stopwords to C:\Users\Vijayabhaskar
[nltk_data]     V\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [73]:
# Preview only the first few cleaned tweets; printing the whole corpus
# floods the notebook with one entry per dataset row.
corpus[:10]

['dhepburn said', 'plu ad commerci experi tacki', 'today must mean need take anoth trip', 'realli aggress blast obnoxi entertain guest face amp littl recours', 'realli big bad thing', 'serious would pay flight seat play realli bad thing fli va', 'ye nearli everi time fli vx ear worm go away', 'realli miss prime opportun men without hat parodi http co mwpg grezp', 'virginamerica well', 'amaz arriv hour earli good', 'know suicid second lead caus death among teen', 'lt pretti graphic much better minim iconographi', 'great deal alreadi think nd trip australia amp even gone st trip yet p', 'virginmedia fli fabul seduct sky u take stress away travel http co ahlxhhkiyn', 'thank', 'sfo pdx schedul still mia', 'excit first cross countri flight lax mco heard noth great thing virgin america daystogo', 'flew nyc sfo last week fulli sit seat due two larg gentleman either side help', 'fli', 'know would amazingli awesom bo fll pleas want fli', 'first fare may three time carrier seat avail select', 'l

In [74]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words features, capped at 1500 vocabulary entries.
# NOTE(review): the vocabulary is fitted on the full corpus, i.e. before the
# train/test split performed in the next cell.
cv = CountVectorizer(max_features=1500)
bow_matrix = cv.fit_transform(corpus)
X = bow_matrix.toarray()

# Target vector: the first column of the frame.
y = dataset.iloc[:, 0].values

In [75]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [76]:
from sklearn.ensemble import RandomForestClassifier

# 100-tree random forest with a fixed seed, fitted on the training split.
classifier = RandomForestClassifier(random_state=0, n_estimators=100)
classifier.fit(X=X_train, y=y_train)

In [77]:
# Predict the held-out rows and print predictions next to the true labels:
# first column is the prediction, second column is the ground truth.
y_pred = classifier.predict(X_test)
print(np.column_stack((y_pred, y_test)))

[[2 2]
 [0 0]
 [0 0]
 ...
 [1 1]
 [0 1]
 [0 0]]


In [78]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Confusion matrix of true vs. predicted labels on the test split, followed
# by the overall accuracy as the cell's displayed value.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
accuracy_score(y_true=y_test, y_pred=y_pred)

[[1685  144   60]
 [ 256  268   56]
 [ 110   54  295]]


0.76775956284153

In [79]:
# NOTE(review): 'â€¦' looks like a mis-decoded ellipsis character; the
# cleaning regex below turns it into spaces, so it does not affect features.
new_input = ["@virginamerica Well, I didn'tâ€¦but NOW I DO! :-D"]

# Put the new text through the same cleaning steps that produced `corpus`
# (handle stripping, letters only, lower-casing, stop-word removal,
# stemming). The vectorizer was fitted on cleaned, stemmed text, so feeding
# it raw text would produce features that do not match the training data.
# Reuses `ps` and `all_stopwords` from the corpus-building cell.
new_stopwords = set(all_stopwords)
new_corpus = []
for raw_text in new_input:
  # Must mirror the handle list used when the training text was cleaned.
  for handle in ('VirginAmerica', 'AmericanAir', 'SouthwestAir', 'USAirways', 'JetBlue'):
    raw_text = raw_text.replace(handle, '')
  tokens = re.sub('[^a-zA-Z]', ' ', raw_text).lower().split()
  new_corpus.append(' '.join(ps.stem(tok) for tok in tokens if tok not in new_stopwords))

new_input_vectorized = cv.transform(new_corpus).toarray()

new_prediction = classifier.predict(new_input_vectorized)

print(f"Predicted class: {new_prediction[0]}")

Predicted class: 2
