In [1]:
import pandas as pd
import re
from textblob import TextBlob
#TF-IDF Features-F1
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC

import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import pickle

# import warnings filter
from warnings import simplefilter
# ignore all future warnings
simplefilter(action='ignore', category=FutureWarning)

# load the dataset
df = pd.read_csv('../data/LGBT_Tweets_processed.csv')

In [2]:
# all lowercase
df.tweet = df.tweet.str.lower()

#Remove twitter handlers
df.tweet = df.tweet.apply(lambda x:re.sub('@[^\s]+','',x))

# remove url
df.tweet = df.tweet.apply(lambda x:re.sub(r"http\S+", "", x))

# Substituting multiple spaces to single space
df.tweet = df.tweet.apply(lambda x:re.sub(r'\s+', ' ', x, flags=re.I))

#remove hashtags
df.tweet = df.tweet.apply(lambda x:re.sub(r'\B#\S+','',x))

#remove all single characters
df.tweet = df.tweet.apply(lambda x:re.sub(r'\s+[a-zA-Z]\s+', '', x))

In [3]:
#Perform sentiment analysis using TextBlob on data
df['sentiment'] = df['tweet'].apply(lambda x: TextBlob(x).sentiment.polarity)
df['sentiment_label'] = df['sentiment'].apply(lambda x: 'positive' if x > 0 else ('negative' if x < 0 else 'Neutral'))

In [4]:
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2),max_df=1.0, min_df=0.0, max_features=5)

# TF-IDF feature matrix
tfidf = tfidf_vectorizer.fit_transform(df['tweet'])

In [5]:
X = tfidf
y = df['sentiment_label']
X_train_tfidf, X_test_tfidf, y_train, y_test = train_test_split(X.toarray(), y, random_state=42, test_size=0.2)
support =LinearSVC(random_state=20)
support.fit(X_train_tfidf,y_train)
y_preds = support.predict(X_test_tfidf)
acc3=accuracy_score(y_test,y_preds)
report = classification_report( y_test, y_preds)
print(report)
print("SVM, Accuracy Score:" , acc3)

              precision    recall  f1-score   support

     Neutral       0.54      0.39      0.45      1875
    negative       0.00      0.00      0.00      1550
    positive       0.51      0.86      0.64      3067

    accuracy                           0.52      6492
   macro avg       0.35      0.42      0.37      6492
weighted avg       0.40      0.52      0.43      6492

SVM, Accuracy Score: 0.5192544670363525


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Saving pretrained model and vectorizer as pkl files

In [6]:
import pickle

pickle.dump(support, open('../model/model.pkl', 'wb'))

pickled_model = pickle.load(open('../model/model.pkl', 'rb'))

print(pickled_model.predict(X))

pickle.dump(tfidf_vectorizer, open('../model/tfidf_vectorizer.pkl', 'wb'))

tfidf_vectorizer_pkl = pickle.load(open('../model/tfidf_vectorizer.pkl', 'rb'))

['Neutral' 'Neutral' 'positive' ... 'Neutral' 'positive' 'positive']


In [7]:
# test on predicting 1 tweet

test_tweet = 'this is a tweet, tell me whether it is positive or negative'
tfidf_test = tfidf_vectorizer_pkl.fit_transform([test_tweet])
pickled_model.predict(tfidf_test)

array(['positive'], dtype=object)

In [8]:
# test on predicting an array of tweet
test_2 = "it need to be 20 character long"
test_3 = "this is very bad model"
test_4 = "being gay is bad, how is this positive. Does it ever go negative?"
test_5 = "should I even depploy this model???? negative, negative"

# list_of_tweet = [test_tweet, test_2, test_3, test_4, test_5]
tfidf_list_of_tweet = tfidf_vectorizer.fit_transform([test_tweet, test_2, test_3, test_4, test_5])

pickled_model.predict(tfidf_list_of_tweet)

array(['positive', 'Neutral', 'positive', 'positive', 'positive'],
      dtype=object)

# Summary

- For deployment, I chose the SVM model with tfidf_vectorizer since it gives the best result out of all the models.
- I wanted to have a product that can take in a tweet and classify whether that tweet is positive, neutral, or negative. I did a couple of tests on the model to find out that it is not practical to have a tweet with 10000 features (a hyperparameter that I previously used for tfidf_vectorizer)
- To deploy a model quickly so I can integrate it, I adjusted the parameters of tfidf_vectorizer from
` tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2),max_df=0.75, min_df=5, max_features=10000) `
to 
` tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2),max_df=1.0, min_df=0.0, max_features=5)`
which significantly reduces 10000 features to 5 features. This also reduces the accuracy score from `0.8561306223043746` to `0.5192544670363525`
- with only 5 features, a tweet needs to be longer (maybe longer than 10 characters for the model to make a prediction)

- I also save the pre-trained model and vectorizer to improve the latency of the final product.
- **Through this preparation for the deployment process, it is shown that:** <br>
1: Sometimes it is important to let the data guide the product development process. it is also important to have an end product in mind <br>
2: while a model may work great in a development environment, there are a lot of challenges raised to bring that greatness to the production environment. <br>