In [0]:
from nltk import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics
import pandas as pd

### import data from csv into a pandas dataframe

Data from: https://www.kaggle.com/crowdflower/twitter-airline-sentiment

In [0]:
# Load the Kaggle airline tweets; header=0 takes the column names from the first CSV row.
kaggle_df = pd.read_csv('twitter-airline-sentiment/tweets.csv', header=0)
# Show (rows, columns) as a quick sanity check of the load.
kaggle_df.shape

(14640, 15)

In [0]:
# Load the second (Stanford / sentiment140) test file. It is read without a
# header row, so columns are addressed by position: column 0 holds the numeric
# polarity and column 5 the tweet text.
stanford_df = pd.read_csv('sentiment140_stanford-dataset/test_data.csv', header=None)


def translate_polarity(x):
    """Translate a numeric polarity code into the label names of the airline data.

    0 becomes 'negative', 2 becomes 'neutral', and every other value is
    treated as 'positive'.
    """
    return 'negative' if x == 0 else ('neutral' if x == 2 else 'positive')


# Translate the whole column in one pass. Building a fresh string column avoids
# writing strings cell-by-cell into an integer column, which is slow and relies
# on dtype upcasting that recent pandas versions warn about.
stanford_df["sentiment"] = stanford_df[0].map(translate_polarity)
stanford_df["text"] = stanford_df[5]
stanford_df.shape

(498, 8)

### divide data: 80% for training and 20% for testing

In [0]:
# Positional 80/20 split: the first four fifths of the rows train the models and
# the rest is held out. Deriving the cut from the frame length (integer maths,
# so no float rounding) keeps the ratio right if the CSV changes size; for the
# 14640 rows loaded above it lands on row 11712, the same cut as before.
split_at = len(kaggle_df) * 4 // 5
labelled_tweets = kaggle_df[["text", "airline_sentiment"]]

# Explicit copies: clean_data() drops rows from the frame it receives, and that
# must never reach back into kaggle_df.
train = labelled_tweets.iloc[:split_at].copy()
test = labelled_tweets.iloc[split_at:].copy()
print(train.shape, test.shape)
train.head()

(11712, 2) (2928, 2)


Unnamed: 0,text,airline_sentiment
0,@VirginAmerica What @dhepburn said.,neutral
1,@VirginAmerica plus you've added commercials t...,positive
2,@VirginAmerica I didn't today... Must mean I n...,neutral
3,@VirginAmerica it's really aggressive to blast...,negative
4,@VirginAmerica and it's a really big bad thing...,negative


In [0]:
# Evaluation frame for the second dataset: only the tweet text and its label,
# held in a frame of its own, separate from stanford_df.
stanford_test = stanford_df[["text", "sentiment"]].copy()
print(stanford_test.shape)
stanford_test.head()

(498, 2)


Unnamed: 0,text,sentiment
0,@stellargirl I loooooooovvvvvveee my Kindle2. ...,positive
1,Reading my kindle2... Love it... Lee childs i...,positive
2,"Ok, first assesment of the #kindle2 ...it fuck...",positive
3,@kenburbary You'll love your Kindle2. I've had...,positive
4,@mikefish Fair enough. But i have the Kindle2...,positive


### create a list of punctuation symbols and stop words

In [0]:
import string
from nltk.corpus import stopwords
# Tokens to discard while cleaning: NLTK's English stop words, every ASCII
# punctuation character, and a hand-picked list of quote marks, contraction
# fragments, URL schemes and tweet shorthand.
english_stops = stopwords.words('english')
english_stops.extend(string.punctuation)  # adds one entry per punctuation character
english_stops.extend([
    '``', "''", "'s", '’', "n't",
    '“', '”', "'re", "'ve", "'m",
    "'d", "'ll", "http", "https",
    '...', 'w/', 'w/o', "--", "I",
])

### clean data by passing the respective dataframe

In [0]:
def clean_data(df, stop_words=None, tokenize=None):
    """Filter the tweets in ``df`` and return one cleaned string per kept row.

    A row is rejected when its ``text`` contains ``"RT @"`` (a retweet), is
    shorter than 20 characters, or is not a string at all (e.g. a missing
    value). Rejected rows are removed from ``df`` **in place**, so after the
    call the rows left in ``df`` line up one-to-one, in order, with the
    returned list. Callers rely on this to pair labels with features.

    Each kept tweet is tokenised and every stop word is discarded. Two stop
    words also remove a neighbouring token:

    * ``@`` removes the token right after it (the user name);
    * ``http`` / ``https`` remove the token two positions later (intended to
      be the rest of the URL; this assumes the tokenizer emits the scheme,
      then ``:``, then the remainder as separate tokens).

    When that neighbour does not exist because the tweet ends first, nothing
    extra is removed (previously this raised ``IndexError``).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``text`` column. Mutated: rejected rows are dropped.
    stop_words : iterable of str, optional
        Tokens to discard. Defaults to the notebook-level ``english_stops``.
    tokenize : callable, optional
        Maps a string to a sequence of tokens. Defaults to ``word_tokenize``.

    Returns
    -------
    list of str
        The kept tweets, each re-joined from its surviving tokens with spaces.
    """
    # A set gives constant-time membership tests; build it once, not per token.
    stops = set(english_stops if stop_words is None else stop_words)
    if tokenize is None:
        tokenize = word_tokenize

    clean_tweets = []
    rejected = []

    for label, text in df['text'].items():
        # Filter before tokenising so rejected tweets cost nothing.
        if not isinstance(text, str) or "RT @" in text or len(text) < 20:
            rejected.append(label)
            continue

        tokens = list(tokenize(text))
        kept = []
        pos = 0
        # Explicit index loop: tokens ahead of ``pos`` may be deleted as we go.
        while pos < len(tokens):
            token = tokens[pos]
            if token in stops:
                if token == '@':
                    doomed = pos + 1  # the user name
                elif token in ('http', 'https'):
                    doomed = pos + 2  # the URL remainder
                else:
                    doomed = None
                # Bounds check: a trailing '@' or 'http' has no neighbour to remove.
                if doomed is not None and doomed < len(tokens):
                    del tokens[doomed]
            else:
                kept.append(token)
            pos += 1

        clean_tweets.append(' '.join(kept))

    # Drop all rejected rows in one call instead of re-copying the frame per row.
    if rejected:
        df.drop(index=rejected, inplace=True)

    return clean_tweets

### clean, fit and transform training tweets

In [0]:
# clean_data also drops the filtered-out rows from `train` itself, so the labels
# in train["airline_sentiment"] stay aligned with the rows of `features`.
clean_training_tweets = clean_data(train)
# Word-level count vectorizer; its vocabulary is fitted on the training tweets only.
vectorizer = CountVectorizer(analyzer="word")
features = vectorizer.fit_transform(clean_training_tweets)
# Show (tweets kept, vocabulary size).
features.shape

(11500, 11613)

### clean test tweets then transform them using the same vectorizer

In [0]:
# As with `train`, clean_data removes the filtered-out rows from `test` in place.
clean_test_tweets = clean_data(test) # the clean tweets of the first dataset
# transform (not fit_transform): reuse the vocabulary fitted on the training tweets.
test_features = vectorizer.transform(clean_test_tweets)
test_features.shape

(2910, 11613)

In [0]:
# clean_data removes the filtered-out rows from `stanford_test` in place.
stanford_tweets = clean_data(stanford_test) # the clean tweets of the second dataset
# Same fitted vectorizer as the training tweets, so the feature columns match.
stanford_features = vectorizer.transform(stanford_tweets)
stanford_features.shape

(475, 11613)

### train using random forest

In [0]:
from sklearn.ensemble import RandomForestClassifier
# Fit a 100-tree random forest (fixed random_state) on the training features
# and the sentiment labels of the rows that survived cleaning.
random_forest = RandomForestClassifier(
    n_estimators=100,
    random_state=0,
).fit(features, train["airline_sentiment"])

### predict using random forest

In [0]:
# Predict the held-out tweets of the first dataset and save each tweet's
# original text next to its predicted sentiment.
rf_predictions = random_forest.predict(test_features)
rf_df = pd.DataFrame(data={"text":test["text"], "sentiment":rf_predictions})
rf_df.to_csv('rf_results.csv', index=False)

# The message reports accuracy, so compute accuracy directly (for single-label
# multiclass predictions it is numerically identical to micro-averaged F1).
print('Accuracy on the 1st dataset: %0.2f' % metrics.accuracy_score(test["airline_sentiment"], rf_predictions))

Accuracy on the 1st dataset: 0.80


In [0]:
# Predict the second dataset with the forest trained on the airline tweets and
# save each tweet's original text next to its predicted sentiment.
stanford_rf_predictions = random_forest.predict(stanford_features)
stanford_rf_df = pd.DataFrame(data={"text":stanford_test["text"], "sentiment":stanford_rf_predictions})
stanford_rf_df.to_csv('stanford_rf_results.csv', index=False)

# The message reports accuracy, so compute accuracy directly (for single-label
# multiclass predictions it is numerically identical to micro-averaged F1).
print('Accuracy on the 2nd dataset: %0.2f' % metrics.accuracy_score(stanford_test["sentiment"], stanford_rf_predictions))

Accuracy on the 2st dataset: 0.54


### train using naive-bayes

In [0]:
from sklearn.naive_bayes import MultinomialNB
# Fit a multinomial naive Bayes model (smoothing alpha=1) on the training
# features and the sentiment labels of the rows that survived cleaning.
naive_bayes = MultinomialNB(alpha=1).fit(
    features,
    train["airline_sentiment"],
)

### predict using naive-bayes

In [0]:
# Predict the held-out tweets of the first dataset and save each tweet's
# original text next to its predicted sentiment.
nb_predictions = naive_bayes.predict(test_features)
nb_df = pd.DataFrame(data={"text":test["text"], "sentiment":nb_predictions})
nb_df.to_csv('nb_results.csv', index=False)

# The message reports accuracy, so compute accuracy directly (for single-label
# multiclass predictions it is numerically identical to micro-averaged F1).
print('Accuracy on the 1st dataset: %0.2f' % metrics.accuracy_score(test["airline_sentiment"], nb_predictions))

Accuracy on the 1st dataset: 0.80


In [0]:
# Predict the second dataset with the naive Bayes model trained on the airline
# tweets and save each tweet's original text next to its predicted sentiment.
stanford_nb_predictions = naive_bayes.predict(stanford_features)
stanford_nb_df = pd.DataFrame(data={"text":stanford_test["text"], "sentiment":stanford_nb_predictions})
stanford_nb_df.to_csv('stanford_nb_results.csv', index=False)

# The message reports accuracy, so compute accuracy directly (for single-label
# multiclass predictions it is numerically identical to micro-averaged F1).
print('Accuracy on the 2nd dataset: %0.2f' % metrics.accuracy_score(stanford_test["sentiment"], stanford_nb_predictions))

Accuracy on the 2st dataset: 0.56


### train using k-nearest neighbour

In [0]:
from sklearn.neighbors import KNeighborsClassifier
# Fit a 3-nearest-neighbour classifier on the training features and the
# sentiment labels of the rows that survived cleaning.
knn = KNeighborsClassifier(n_neighbors=3).fit(
    features,
    train["airline_sentiment"],
)

### predict using k-nearest neighbour

In [0]:
# Predict the held-out tweets of the first dataset and save each tweet's
# original text next to its predicted sentiment.
knn_predictions = knn.predict(test_features)
knn_df = pd.DataFrame(data={"text":test["text"], "sentiment":knn_predictions})
knn_df.to_csv('knn_results.csv', index=False)

# The message reports accuracy, so compute accuracy directly (for single-label
# multiclass predictions it is numerically identical to micro-averaged F1).
print('Accuracy on the 1st dataset: %0.2f' % metrics.accuracy_score(test["airline_sentiment"], knn_predictions))

Accuracy on the 1st dataset: 0.60


In [0]:
# Predict the second dataset with the k-NN model trained on the airline tweets
# and save each tweet's original text next to its predicted sentiment.
stanford_knn_predictions = knn.predict(stanford_features)
stanford_knn_df = pd.DataFrame(data={"text":stanford_test["text"], "sentiment":stanford_knn_predictions})
stanford_knn_df.to_csv('stanford_knn_results.csv', index=False)

# The message reports accuracy, so compute accuracy directly (for single-label
# multiclass predictions it is numerically identical to micro-averaged F1).
print('Accuracy on the 2nd dataset: %0.2f' % metrics.accuracy_score(stanford_test["sentiment"], stanford_knn_predictions))

Accuracy on the 2st dataset: 0.44
