In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import pickle

In [2]:
# Read the labelling workbook into a single DataFrame.
df = pd.read_excel('03-data-labelling.xlsx')

# Partition the rows into a training frame and a testing frame. The fixed
# random_state makes the shuffle (and therefore the split) repeatable.
training_data, testing_data = train_test_split(df, random_state=2000)

# Target arrays: the 'Advertisement' column of each partition, in the same
# row order as the frame it was taken from.
Y_train, Y_test = (
    part['Advertisement'].values for part in (training_data, testing_data)
)


In [3]:
def extract_features(df, field, training_data, testing_data):
    """Build TF-IDF feature matrices for one text column.

    The vectorizer is fitted on the training split only, so the vocabulary
    and IDF weights are learned without looking at the test rows; the test
    split is then projected onto that same vocabulary.

    Parameters
    ----------
    df
        Not used by this function. Kept in the signature so existing
        positional calls continue to work.
    field
        Label of the column that holds the text to vectorize.
    training_data, testing_data
        Frames (or anything indexable by ``field`` whose result exposes
        ``.values``) holding the training and testing rows.

    Returns
    -------
    tuple
        ``(train_feature_set, test_feature_set, tfidf_vectorizer)``: the two
        feature matrices and the fitted vectorizer that produced them.
    """
    # max_df=0.95 drops terms that occur in more than 95% of the training
    # documents.
    tfidf_vectorizer = TfidfVectorizer(use_idf=True, max_df=0.95)

    # astype('U') coerces every cell to a unicode string so the vectorizer
    # never receives non-string values.
    # NOTE(review): missing cells presumably become the literal text 'nan'
    # here -- confirm whether they should be filtered out upstream.
    train_text = training_data[field].values.astype('U')

    # fit_transform learns the vocabulary/IDF and returns the training matrix
    # in one pass, instead of fitting and then vectorizing the same text again.
    train_feature_set = tfidf_vectorizer.fit_transform(train_text)
    test_feature_set = tfidf_vectorizer.transform(
        testing_data[field].values.astype('U'))

    return train_feature_set, test_feature_set, tfidf_vectorizer

# Vectorize the 'Tweets' column of both splits; the fitted vectorizer is
# returned alongside the two feature matrices.
X_train, X_test, vectorizer = extract_features(
    df=df,
    field='Tweets',
    training_data=training_data,
    testing_data=testing_data,
)


In [4]:
from sklearn.linear_model import LogisticRegression

# L2-regularised logistic regression using the liblinear solver.
# verbose=1 makes the solver emit its own progress marker to the cell output.
logisticReg = LogisticRegression(
    penalty='l2',
    C=5,
    solver='liblinear',
    max_iter=1000,
    random_state=0,
    verbose=1,
)

# Train on the TF-IDF training matrix.
model = logisticReg.fit(X_train, Y_train)

# Evaluate on the held-out split and show the resulting score.
logisticRegScore = model.score(X_test, Y_test)
print(logisticRegScore)


[LibLinear]0.704


In [5]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial naive Bayes with default settings, trained on the same TF-IDF
# features as the logistic regression above.
naiveBayesModel = MultinomialNB()
naiveBayesModel.fit(X_train, Y_train)

# Evaluate on the held-out split and show the resulting score.
naiveBayesScore = naiveBayesModel.score(X_test, Y_test)
print(naiveBayesScore)

0.72


In [6]:
# Save model
# A context manager flushes and closes the file as soon as the dump finishes,
# even if pickling raises, so no handle is left open in the kernel.
filename = 'model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(naiveBayesModel, model_file)

# Save vectorizer
# The model was fitted on features produced by this vectorizer, so the two
# artifacts need to be loaded together to score new text.
with open("vectorizer.pickle", "wb") as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)