# Problem Statement
```
Sentiment analysis remains one of the key problems that has seen extensive application of natural language processing. This time around, given the tweets from customers about various tech firms who manufacture and sell mobiles, computers, laptops, etc, the task is to identify if the tweets have a negative sentiment towards such companies or products.
```

### My Rank - 475
### Score - 0.7867932144

In [1]:
import pandas as pd
import numpy as np
import nltk
from sklearn.metrics import accuracy_score, f1_score

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
train.head()

Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...
1,2,0,Finally a transparant silicon case ^^ Thanks t...
2,3,0,We love this! Would you go? #talk #makememorie...
3,4,0,I'm wired I know I'm George I was made that wa...
4,5,1,What amazing service! Apple won't even talk to...


In [3]:
import re
from nltk import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords

lemm = WordNetLemmatizer()

#Clean text from noise
def clean_text(text):
    #Filter to allow only alphabets
    text = re.sub(r'[^a-zA-Z\']', ' ', text)
    
    #Remove Unicode characters
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    
    #Convert to lowercase to maintain consistency
    text = text.lower()
       
    return text

# corpus = []
# for i in range(len(train)):
#     #Filter to allow only alphabets
#     text = re.sub(r'[^a-zA-Z\']', ' ', train['tweet'][i])
    
#     #Remove Unicode characters
#     text = re.sub(r'[^\x00-\x7F]+', '', train['tweet'][i])
    
#     #Convert to lowercase to maintain consistency
#     text = text.lower()
    
#     text = word_tokenize(text)
#     text = [lemm.lemmatize(word) for word in text if not word in stopwords.words('english')]
    
#     corpus.append(text)

In [4]:
train['clean_tweet'] = train['tweet'].apply(lambda x: clean_text(x))
test['clean_tweet'] = test['tweet'].apply(lambda x: clean_text(x))
train.head()

Unnamed: 0,id,label,tweet,clean_tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...,fingerprint pregnancy test https goo gl h ...
1,2,0,Finally a transparant silicon case ^^ Thanks t...,finally a transparant silicon case thanks t...
2,3,0,We love this! Would you go? #talk #makememorie...,we love this would you go talk makememorie...
3,4,0,I'm wired I know I'm George I was made that wa...,i'm wired i know i'm george i was made that wa...
4,5,1,What amazing service! Apple won't even talk to...,what amazing service apple won't even talk to...


In [5]:
#Exhaustive list of stopwords in the english language. We want to focus less on these so at some point will have to filter
from nltk.corpus import stopwords
 
STOP_WORDS = set(stopwords.words('english'))

#Generate word frequency
def gen_freq(text):
    #Will store the list of words
    word_list = []

    #Loop over all the tweets and extract words into word_list
    for tw_words in text.split():
        word_list.extend(tw_words)

    #Create word frequencies using word_list
    word_freq = pd.Series(word_list).value_counts()
    
    #Drop the stopwords during the frequency calculation
    word_freq = word_freq.drop(STOP_WORDS, errors='ignore')
    
    #Print top 20 words
    #word_freq[:20]
    
    return word_freq
word_freq = gen_freq(train['clean_tweet'].str)

In [6]:
#Import libraries
import matplotlib.pyplot as plt
from wordcloud import WordCloud

#Generate word cloud
wc = WordCloud(width=400, height=330, max_words=100, background_color='white').generate_from_frequencies(word_freq)

plt.figure(figsize=(12, 8))
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')
plt.show()

<Figure size 1200x800 with 1 Axes>

In [7]:
print(word_freq)

iphone                        4313
http                          3442
com                           3389
apple                         2974
p                             2784
instagram                     2187
samsung                       1441
twitter                       1262
new                           1160
https                         1014
phone                          998
sony                           855
instagr                        789
follow                         767
www                            622
pic                            610
ipad                           538
like                           525
love                           465
life                           425
android                        422
ios                            409
rt                             388
day                            386
ly                             363
cute                           338
photo                          330
photography                    326
case                

In [8]:
text = train.clean_tweet.str
word_list = []

#Loop over all the tweets and extract words into word_list
for tw_words in text.split():
    word_list.extend(tw_words)

In [9]:
word_freq = gen_freq(train.clean_tweet.str)
#100 most rare words in the dataset
rare_100 = word_freq[-100:]

train['word_count'] = train['clean_tweet'].str.split().apply(lambda x: len(x))

word_freq = gen_freq(test.clean_tweet.str)
#100 most rare words in the dataset
rare_100 = word_freq[-100:]

test['word_count'] = test['clean_tweet'].str.split().apply(lambda x: len(x))

train.head()

Unnamed: 0,id,label,tweet,clean_tweet,word_count
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...,fingerprint pregnancy test https goo gl h ...,17
1,2,0,Finally a transparant silicon case ^^ Thanks t...,finally a transparant silicon case thanks t...,21
2,3,0,We love this! Would you go? #talk #makememorie...,we love this would you go talk makememorie...,19
3,4,0,I'm wired I know I'm George I was made that wa...,i'm wired i know i'm george i was made that wa...,22
4,5,1,What amazing service! Apple won't even talk to...,what amazing service apple won't even talk to...,22


In [10]:
#Check whether a negation term is present in the text
def any_neg(words):
    for word in words:
        if word in ['n', 'no', 'non', 'not'] or re.search(r"\wn't", word):
            return 1
    else:
        return 0

#Check whether one of the 100 rare words is present in the text
def any_rare(words, rare_100):
    for word in words:
        if word in rare_100:
            return 1
    else:
        return 0

#Check whether prompt words are present
def is_question(words):
    for word in words:
        if word in ['when', 'what', 'how', 'why', 'who']:
            return 1
    else:
        return 0

In [11]:
#Negation present or not
train['any_neg'] = train.clean_tweet.str.split().apply(lambda x: any_neg(x))
#Prompt present or not
train['is_question'] = train.clean_tweet.str.split().apply(lambda x: is_question(x))

#Negation present or not
test['any_neg'] = test.clean_tweet.str.split().apply(lambda x: any_neg(x))
#Prompt present or not
test['is_question'] = test.clean_tweet.str.split().apply(lambda x: is_question(x))

train.head()

Unnamed: 0,id,label,tweet,clean_tweet,word_count,any_neg,is_question
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...,fingerprint pregnancy test https goo gl h ...,17,0,0
1,2,0,Finally a transparant silicon case ^^ Thanks t...,finally a transparant silicon case thanks t...,21,0,0
2,3,0,We love this! Would you go? #talk #makememorie...,we love this would you go talk makememorie...,19,1,0
3,4,0,I'm wired I know I'm George I was made that wa...,i'm wired i know i'm george i was made that wa...,22,0,0
4,5,1,What amazing service! Apple won't even talk to...,what amazing service apple won't even talk to...,22,1,1


In [12]:
#Any of the most 100 rare words present or not
train['any_rare'] = train.clean_tweet.str.split().apply(lambda x: any_rare(x, rare_100))
#Character count of the tweet
train['char_count'] = train.clean_tweet.apply(lambda x: len(x))

#Any of the most 100 rare words present or not
test['any_rare'] = test.clean_tweet.str.split().apply(lambda x: any_rare(x, rare_100))
#Character count of the tweet
test['char_count'] = test.clean_tweet.apply(lambda x: len(x))

In [13]:
train.head()

Unnamed: 0,id,label,tweet,clean_tweet,word_count,any_neg,is_question,any_rare,char_count
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...,fingerprint pregnancy test https goo gl h ...,17,0,0,0,128
1,2,0,Finally a transparant silicon case ^^ Thanks t...,finally a transparant silicon case thanks t...,21,0,0,0,131
2,3,0,We love this! Would you go? #talk #makememorie...,we love this would you go talk makememorie...,19,1,0,0,123
3,4,0,I'm wired I know I'm George I was made that wa...,i'm wired i know i'm george i was made that wa...,22,0,0,0,112
4,5,1,What amazing service! Apple won't even talk to...,what amazing service apple won't even talk to...,22,1,1,0,124


# Data Splitting

In [14]:
from sklearn.model_selection import train_test_split

X = train[['word_count', 'any_neg', 'any_rare', 'char_count', 'is_question']]
y = train.label
test = test[['word_count', 'any_neg', 'any_rare', 'char_count', 'is_question']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=27)

# Logistic Regression

In [15]:
from sklearn.linear_model import LogisticRegression
logr = LogisticRegression().fit(X_train, y_train)
y_pred = logr.predict(X_test)
f1_score(y_test, y_pred)



0.5064935064935063

# Decision Tree

In [16]:
from sklearn.tree import DecisionTreeClassifier
dtree = DecisionTreeClassifier().fit(X_train, y_train)
y_pred = dtree.predict(X_test)
f1_score(y_test, y_pred)

0.42172523961661335

# Random Forest

In [17]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier().fit(X_train, y_train)
y_pred = rf.predict(X_test)
f1_score(y_test, y_pred)



0.453416149068323

# Naive Bayes (GaussianNB)

In [18]:
from sklearn.naive_bayes import GaussianNB
model = GaussianNB()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
f1_score(y_test, y_pred)

0.48863636363636365

# Xgboost

In [19]:
from xgboost import XGBClassifier
xg = XGBClassifier()
xg.fit(X_train, y_train)
y_pred = xg.predict(X_test)
f1_score(y_test, y_pred)

0.5757575757575758

# Predicting on test data

In [20]:
xg.fit(X, y)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [21]:
T_pred = xg.predict(test)

In [22]:
sub = pd.read_csv('sample_submission.csv')
sub['label'] = T_pred
sub.to_csv('sample_submission.csv', index=False)  # Rank - 475   Score - 78%