In [1]:
import pandas as pd
import numpy as np
import nltk
import re

### Preprocessing

In [2]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [3]:
train.head()

Unnamed: 0,textID,text,sentiment
0,cb774db0d1,"I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,negative
2,088c60f138,my boss is bullying me...,negative
3,9642c003ef,what interview! leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...",negative


In [4]:
test.head()

Unnamed: 0,textID,text
0,f87dea47db,Last session of the day http://twitpic.com/67ezh
1,96d74cb729,Shanghai is also really exciting (precisely -...
2,eee518ae67,"Recession hit Veronique Branquinho, she has to..."
3,01082688c6,happy bday!
4,33987a8ee5,http://twitpic.com/4w75p - I like it!!


Drop rows where "text" is empty.

In [5]:
train.dropna(subset = ["text"], inplace = True)
test.dropna(subset = ["text"], inplace = True)

Remove punctuation.

In [6]:
train.text = train.text.apply(lambda x: re.sub(r"[^\w\s]", "", x))
test.text = test.text.apply(lambda x: re.sub(r"[^\w\s]", "", x))

Remove stopwords.

In [7]:
from nltk.corpus import stopwords

stopwords = stopwords.words("english")
train.text = train.text.apply(lambda x: " ".join([word for word in nltk.word_tokenize(x) if word.lower() not in stopwords]))
test.text = test.text.apply(lambda x: " ".join([word for word in nltk.word_tokenize(x) if word.lower() not in stopwords]))

Remove links.

In [8]:
train.text = train.text.apply(lambda x: re.sub(r"\(?http\S+", "", x))
test.text = test.text.apply(lambda x: re.sub(r"\(?http\S+", "", x))

Lemmatization.

In [9]:
lemmatizer = nltk.WordNetLemmatizer()
train.text = train.text.apply(lambda x: " ".join(
    [lemmatizer.lemmatize(word) for word in nltk.word_tokenize(x)]
))
test.text = test.text.apply(lambda x: " ".join(
    [lemmatizer.lemmatize(word) for word in nltk.word_tokenize(x)]
))

Drop rows where "text" is empty, again.

In [10]:
train.dropna(subset = ["text"], inplace = True)
test.dropna(subset = ["text"], inplace = True)

### Naive Bayes sentiment classifier

In [11]:
# from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn import metrics

We use CountVectorizer to help us create a matrix for machine learning.

In [12]:
vectorizer = CountVectorizer()

# Features matrix for train
train_x = vectorizer.fit_transform(train.text)

# Fit the same CountVectorizer on my test data
# We need the features matrix to be the same for the test data
test_x = vectorizer.transform(test.text)

In [13]:
# Check that it is the same size features_matrix
train_x.shape

(27480, 26934)

In [14]:
# Same number of columns
test_x.shape

(3534, 26934)

We instantiate a multinomial naive Bayes model, then fit it.

In [15]:
nb = MultinomialNB()

# nb.fit(features_matrix, item_we_want_to_predict - y values)
nb.fit(train_x, train.sentiment)

MultinomialNB()

In [16]:
# Predicting on the "test" dataset
predictions_test = nb.predict(test_x)

In [17]:
# Predicting on the original "train" dataset
predictions_train = nb.predict(train_x)

In [18]:
# Accuracy score for predicting on the "train" dataset
metrics.accuracy_score(train.sentiment, predictions_train)

0.8368995633187772

In [30]:
# Dropping the "text" column from the test dataset
test = test.drop(columns = ["text"])

In [31]:
# Adding the sentiment predictions as a new column to the test dataset
test["sentiment"] = predictions_test

In [33]:
# Writing this to a csv
test.to_csv("kaggle_submission.csv", index = False)