## Import Libraries

In [2]:
import re
import sklearn
import numpy as np
import pandas as pd
import preprocessor as p

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

## Import Datasets

In [None]:
# Read both CSV splits in one pass: the first file becomes `train`, the
# second becomes `test`. Paths are relative, so they resolve against the
# kernel's current working directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("SentimentDataset_train.csv", "SentimentDataset_test.csv")
)

## Data Analysis

In [None]:
# Preview the first rows of the training frame (rendered as the cell output).
train.head()

In [None]:
# Preview the last rows of the test frame (rendered as the cell output).
test.tail()

In [None]:
# good sentiment related tweets: how many training rows carry label 0
# (counted with a vectorized comparison, returned as a plain Python int)
int(train["label"].eq(0).sum())

In [None]:
# bad sentiment related tweets: how many training rows carry label 1
# (counted with a vectorized comparison, returned as a plain Python int)
int(train["label"].eq(1).sum())

In [None]:
# Per-column count of missing entries in the training frame;
# a column showing 0 has no gaps.
train.isna().sum()

## Data Cleaning

In [None]:
# Punctuation handling for tweet cleaning.
#
# Both patterns are written as raw strings so that regex escapes such as
# `\.` and `\s` reach the `re` module untouched instead of being parsed as
# (invalid) Python string escapes, which triggers a warning on current
# Python versions.

# Characters that are deleted outright, leaving no space behind.
REPLACE_NO_SPACE = re.compile(r"(\.)|(\;)|(\:)|(\!)|(\')|(\?)|(\,)|(\")|(\|)|(\()|(\))|(\[)|(\])|(\%)|(\$)|(\>)|(\<)|(\{)|(\})")

# Sequences that are replaced by a single space: a doubled "<br />" tag,
# a hyphen, a slash, or a colon together with the character that follows it.
# NOTE(review): when REPLACE_NO_SPACE is applied first, "<", ">" and ":" are
# already gone, so only the "-" and "/" alternatives can still match.
REPLACE_WITH_SPACE = re.compile(r"(<br\s/><br\s/?)|(-)|(/)|(:).")

In [None]:
def clean_tweets(df):
    """Return a list with one cleaned string per tweet in ``df``.

    Args:
        df: an iterable of raw tweet strings (despite the name, any iterable
            works; the notebook passes a single DataFrame column).

    Returns:
        list[str]: the cleaned tweets, in the same order as the input.
    """

    def _scrub(raw_tweet):
        # First pass: tweet-preprocessor's own cleaning, then lower-case.
        lowered = p.clean(raw_tweet).lower()
        # Second pass: delete punctuation that should vanish entirely ...
        without_punct = REPLACE_NO_SPACE.sub("", lowered)
        # ... and turn separator-like sequences into a single space.
        return REPLACE_WITH_SPACE.sub(" ", without_punct)

    return [_scrub(raw_tweet) for raw_tweet in df]

In [None]:
# Clean every training tweet and wrap the resulting list in a
# one-column DataFrame.
train_tweet = pd.DataFrame(clean_tweets(train["tweet"]))

In [None]:
# append cleaned tweets to the training dataset
# (train_tweet is the one-column DataFrame holding the cleaned tweets)
# NOTE(review): assigning a DataFrame to a column presumably aligns on the row
# index, so this relies on `train` still having read_csv's default index -
# confirm if rows are ever filtered or re-indexed before this cell.
train["clean_tweet"] = train_tweet

# display the new dataset
train.head()

In [None]:
# Clean every test tweet and wrap the resulting list in a
# one-column DataFrame.
test_tweet = pd.DataFrame(clean_tweets(test["tweet"]))

In [None]:
# append cleaned tweets to the test dataset
# (test_tweet is the one-column DataFrame holding the cleaned tweets)
# NOTE(review): assigning a DataFrame to a column presumably aligns on the row
# index, so this relies on `test` still having read_csv's default index -
# confirm if rows are ever filtered or re-indexed before this cell.
test["clean_tweet"] = test_tweet

# display the new dataset
test.tail()

## Train and Test Split

In [None]:
# Target vector: the label of every row in the training frame.
y = train["label"].values

# Hold out 30% of the cleaned tweets for evaluation and keep 70% for fitting.
# The split is shuffled, stratified on the label so both parts keep the same
# class balance, and seeded so it is repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    train["clean_tweet"].values,
    y,
    test_size=0.3,
    shuffle=True,
    stratify=y,
    random_state=1,
)

## Vectorize tweets using CountVectorizer()

In [None]:
# Initialize the CountVectorizer: binary presence/absence features with
# English stop words removed.
vectorizer = CountVectorizer(binary = True, stop_words = "english")

# Learn the vocabulary from the TRAINING split only and build its
# document-term matrix in the same step. Fitting on the held-out tweets as
# well would leak information from the evaluation data into feature
# extraction.
x_train_vec = vectorizer.fit_transform(x_train)

# Transform the held-out documents with the vocabulary learned above;
# tokens that never occurred in the training split are ignored.
x_test_vec = vectorizer.transform(x_test)

## Model Building

In [None]:
# Import the estimator class directly so that the variable `svm` below does
# not rebind (shadow) the `sklearn.svm` module name.
from sklearn.svm import SVC

# We shall apply Support Vector Classifier (SVC) with a linear kernel.
# probability=True enables predict_proba on the fitted model; random_state
# seeds the internal shuffling used for those probability estimates so that
# repeated runs give the same model.
svm = SVC(kernel = "linear", probability = True, random_state = 1)

# fit the SVC model based on the given training data
# (the former chained predict_proba call was dropped: its result was
# computed and then thrown away)
svm.fit(x_train_vec, y_train)

# perform classification and prediction on samples in x_test
y_pred_svm = svm.predict(x_test_vec)

## Accuracy score for SVC

In [None]:
# Share of held-out tweets whose predicted label matches the true label,
# expressed as a percentage.
svc_accuracy_pct = accuracy_score(y_test, y_pred_svm) * 100
print("Accuracy score for SVC is:", svc_accuracy_pct, "%")

## Saving the model

In [None]:
import pickle

# We will use the pickle package to save our fitted model as a file.
# NOTE(review): only the classifier is persisted here. Anything that loads
# this file also needs the fitted `vectorizer` to turn text into features -
# confirm how consumers obtain it.
filename = 'moody_sentiment_model.sav'

# The context manager guarantees the file handle is flushed and closed even
# if pickling raises.
with open(filename, 'wb') as model_file:
    pickle.dump(svm, model_file)