In [2]:
import re

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score

In [7]:
def cleaning(document):
    """Normalize one raw text snippet into a cleaned string.

    Only the first preprocessing step is implemented so far: every run of
    non-ASCII characters is replaced by a single space. The remaining steps
    are listed as TODOs in the body.

    Args:
        document: Raw text. ``None`` and float NaN (a missing cell) are
            treated as empty text; any other non-string value is converted
            with ``str()`` before cleaning.

    Returns:
        str: The cleaned text (``""`` for a missing value).
    """
    if not isinstance(document, str):
        # re.sub raises TypeError on non-strings, which would abort an entire
        # Series.apply call on the first missing value.
        # `document != document` is true only for NaN.
        if document is None or (isinstance(document, float) and document != document):
            return ""
        document = str(document)

    # remove non-ASCII characters (each run collapses to one space)
    result = re.sub(r'[^\x00-\x7F]+', ' ', document)
    # TODO: the steps below are not implemented yet.
    #   I.    Remove URLs.
    #   II.   Remove all irrelevant characters (numbers and punctuation).
    #   III.  Convert all characters to lowercase.
    #   IV.   Tokenization.
    #   V.    Remove stopwords.
    #   VI.   Stemming and lemmatization.
    #   VII.  Remove words of length <= 2.
    #   VIII. Convert the list of tokens back into a string.
    return result


In [5]:
# Load the labelled tweet sample from the comma-separated file.
df = pd.read_csv("tweet1000.csv", sep=",")
# Preview the first rows.
df.head()

Unnamed: 0.1,Unnamed: 0,Snippet,Sentiment
0,0,I like you Starbucks but instagram.com/p/_-bE9...,positive
1,1,RT @fuckitlist49: Triple whiskey straight. Sir...,neutral
2,2,RT @fuckitlist49: Triple whiskey straight. Sir...,neutral
3,3,RT @fuckitlist49: Triple whiskey straight. Sir...,neutral
4,4,@MiniLaddd I probably gonna get killed by load...,neutral


In [8]:
# Run every snippet through cleaning() and store the result in a new column.
df['clean_text'] = df['Snippet'].apply(cleaning)
# Preview the raw and cleaned text side by side.
df.head()

Unnamed: 0.1,Unnamed: 0,Snippet,Sentiment,clean_text
0,0,I like you Starbucks but instagram.com/p/_-bE9...,positive,I like you Starbucks but instagram.com/p/_-bE9...
1,1,RT @fuckitlist49: Triple whiskey straight. Sir...,neutral,RT @fuckitlist49: Triple whiskey straight. Sir...
2,2,RT @fuckitlist49: Triple whiskey straight. Sir...,neutral,RT @fuckitlist49: Triple whiskey straight. Sir...
3,3,RT @fuckitlist49: Triple whiskey straight. Sir...,neutral,RT @fuckitlist49: Triple whiskey straight. Sir...
4,4,@MiniLaddd I probably gonna get killed by load...,neutral,@MiniLaddd I probably gonna get killed by load...


In [9]:
# Build TF-IDF features over unigrams, bigrams and trigrams.
# NOTE(review): the vectorizer is fitted on every row of df, i.e. before the
# train/test split performed later, so the held-out documents contribute to
# the vocabulary and IDF weights. Consider splitting first and fitting on the
# training portion only.
clean_docs = df['clean_text'].values.astype('U')  # cast once to a unicode string array
tfidf_vect = TfidfVectorizer(ngram_range=(1, 3), min_df=1)
# fit_transform learns the vocabulary/IDF and vectorizes the same corpus in
# one pass; it is equivalent to fit() followed by transform().
x_data = tfidf_vect.fit_transform(clean_docs)
# target labels
y_label = df['Sentiment']

In [10]:
# Show the distinct values of the target column to see which classes exist.
y_label.unique()

array(['positive', 'neutral', 'negative'], dtype=object)

In [11]:
# Split features and labels into train and test sets (33% held out);
# the fixed random_state makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x_data,
    y_label,
    test_size=0.33,
    random_state=12345,
)

In [8]:
# Fit a linear-kernel support vector classifier on the training split.
# TODO(you): try other kernel and parameters
clf = svm.SVC(kernel='linear').fit(x_train, y_train)
# Display the fitted estimator.
clf

SVC(kernel='linear')

In [9]:
# Predict labels for the held-out split and report the fraction that match.
y_predict = clf.predict(x_test)
test_accuracy = accuracy_score(y_test, y_predict)
print("accuracy ", test_accuracy)


accuracy  0.8212121212121212


In [None]:
# TODO: evaluate the classifier by AUC (the labels are multi-class, so use e.g. one-vs-rest ROC AUC)

In [12]:
# Hint: https://medium.com/analytics-vidhya/text-preprocessing-for-nlp-natural-language-processing-beginners-to-master-fd82dfecf95