# Prediction
1. Logistic Regression and SVM models are stored as pickles
2. Cleaned test tweets are also stored as a pickle file
3. Load test tweets, Vectorize/Tokenize/Pad as needed by the respective models
4. Load Models
5. Predict the outcome and store the result in an Excel file

In [1]:
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
# nlp preprocessing libs
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

## Load cleaned test tweets

In [2]:
# Load the cleaned test tweets that were saved as a pickle.
# `path` is the project root folder; file locations below are built from it.
path = '/content/drive/MyDrive/Guvi Files/Final Project2(Tweets)/'
test_tweets = pd.read_pickle(f'{path}data/test_tweets_clean.pkl')

In [3]:
# Preview the first rows of the loaded test tweets
test_tweets.head()

Unnamed: 0,tweet,clean_tweet
0,#studiolife #aislife #requires #passion #dedic...,to find
1,@user #white #supremacists want everyone to s...,want everyone to see the new and heres...
2,safe ways to heal your #acne!! #altwaystohe...,safe ways to heal your
3,is the hp and the cursed child book up for res...,is the hp and the cursed child book up for res...
4,"3rd #bihday to my amazing, hilarious #nephew...",to my amazing hilarious eli ahmir uncle dav...


In [4]:
# Keep an independent copy of the whole frame (used later to attach the
# predictions), then narrow test_tweets down to the cleaned text column.
full_tweets = test_tweets.copy()  # deep copy is the pandas default
test_tweets = test_tweets['clean_tweet']

In [6]:
# Preview the full copy (raw tweet + clean_tweet columns)
full_tweets.head()

Unnamed: 0,tweet,clean_tweet
0,#studiolife #aislife #requires #passion #dedic...,to find
1,@user #white #supremacists want everyone to s...,want everyone to see the new and heres...
2,safe ways to heal your #acne!! #altwaystohe...,safe ways to heal your
3,is the hp and the cursed child book up for res...,is the hp and the cursed child book up for res...
4,"3rd #bihday to my amazing, hilarious #nephew...",to my amazing hilarious eli ahmir uncle dav...


## Remove stopwords and tokenize test tweets

In [7]:
# Fetch the NLTK resources used below (stopword list and punkt tokenizer data).
nltk.download('stopwords')
nltk.download('punkt')
# A set gives O(1) membership checks; a list would be scanned once per token.
stop_words = set(stopwords.words('english'))


def tokenize_and_remove_stopwords(text):
    """Split *text* into word tokens and drop English stopwords.

    Args:
        text: A single tweet as a string.

    Returns:
        List of tokens (original casing kept) whose lowercase form is not
        in ``stop_words``.
    """
    return [
        token
        for token in nltk.word_tokenize(text)
        if token.lower() not in stop_words
    ]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [8]:
# Run tokenization + stopword removal over every cleaned tweet
processed_data = [tokenize_and_remove_stopwords(tweet) for tweet in test_tweets]
print(processed_data)



## Lemmatize

In [9]:
# Lemmatize every token, then re-join each tweet's tokens into a single
# space-separated sentence.
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()
lemmatized_output = [
    ' '.join(lemmatizer.lemmatize(word) for word in tokens)
    for tokens in processed_data
]

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [10]:
# Show the lemmatized sentences (one string per tweet)
print(lemmatized_output)



## Vectorize

In [11]:
# Reuse the TfidfVectorizer that was fitted during training.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open(f'{path}data/train_tfidf.pkl', 'rb') as file:
    tfidf_trained = pickle.load(file)

In [12]:
# Vectorize the lemmatized test tweets with the already-fitted TF-IDF
# vectorizer (transform only; nothing is re-fitted on the test data).
X_test_tfidf = tfidf_trained.transform(lemmatized_output)

## Load Logistic Regression Model from Pickle

In [13]:
# Load the trained Logistic Regression model.
# The location is built from `path` instead of repeating the absolute folder,
# matching how the TF-IDF vectorizer is loaded.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open(path + 'models/final_model_lr.pkl', 'rb') as file:
    lr_model = pickle.load(file)

### Predict using LR Model

In [14]:

# Predict a label for every TF-IDF-vectorized test tweet with the LR model
lr_prediction = lr_model.predict(X_test_tfidf)

In [15]:
# Quick look at the raw prediction array
print(lr_prediction)

[0 0 0 ... 0 0 0]


In [16]:
# Convert lr_prediction to a DataFrame (the predictions end up in column 0)
lr_prediction = pd.DataFrame(lr_prediction)

In [17]:
# Count how many tweets received each predicted label
lr_prediction[0].value_counts()

Unnamed: 0_level_0,count
0,Unnamed: 1_level_1
0,15854
1,1343


In [18]:
# Add the LR predictions to the full tweet frame.
# Assign the raw values so rows are matched by position: assigning the Series
# itself aligns on index labels and would give NaN / misplaced rows whenever
# full_tweets does not carry a default 0..n-1 index.
full_tweets['lr_prediction'] = lr_prediction[0].to_numpy()

In [19]:
# Check that the lr_prediction column was added
full_tweets.head()

Unnamed: 0,tweet,clean_tweet,lr_prediction
0,#studiolife #aislife #requires #passion #dedic...,to find,0
1,@user #white #supremacists want everyone to s...,want everyone to see the new and heres...,0
2,safe ways to heal your #acne!! #altwaystohe...,safe ways to heal your,0
3,is the hp and the cursed child book up for res...,is the hp and the cursed child book up for res...,0
4,"3rd #bihday to my amazing, hilarious #nephew...",to my amazing hilarious eli ahmir uncle dav...,0


## Prediction using SVM Model

In [20]:
# Load the stored SVM model.
# The location is built from `path` instead of repeating the absolute folder,
# matching how the TF-IDF vectorizer is loaded.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open(path + 'models/svm_base_model.pkl', 'rb') as file:
    svm_model = pickle.load(file)

In [21]:
# Predict a label for every TF-IDF-vectorized test tweet with the SVM model
svm_prediction = svm_model.predict(X_test_tfidf)

In [22]:
# Wrap the SVM predictions in a DataFrame (values land in column 0) and add
# them to the full tweet frame.
svm_prediction = pd.DataFrame(svm_prediction)
# Assign the raw values so rows are matched by position: assigning the Series
# itself aligns on index labels and would give NaN / misplaced rows whenever
# full_tweets does not carry a default 0..n-1 index.
full_tweets['svm_prediction'] = svm_prediction[0].to_numpy()


In [23]:
# Check that the svm_prediction column was added
full_tweets.head()

Unnamed: 0,tweet,clean_tweet,lr_prediction,svm_prediction
0,#studiolife #aislife #requires #passion #dedic...,to find,0,0
1,@user #white #supremacists want everyone to s...,want everyone to see the new and heres...,0,0
2,safe ways to heal your #acne!! #altwaystohe...,safe ways to heal your,0,0
3,is the hp and the cursed child book up for res...,is the hp and the cursed child book up for res...,0,0
4,"3rd #bihday to my amazing, hilarious #nephew...",to my amazing hilarious eli ahmir uncle dav...,0,0


In [24]:
# Write full_tweets (tweets plus the prediction columns) to an Excel file.
# The location is built from `path` instead of repeating the absolute folder.
full_tweets.to_excel(path + 'models/test_tweets_prediction.xlsx', index=False)