In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
import pickle

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

## Set parameters

In [3]:
# Notebook configuration: everything a reader might tune lives in this cell.

# Output directory for the pickled artifacts (the trailing slash is relied on
# by the string concatenation in the save cell).
destination = "django_project/models/"

# Class labels; not referenced by any other cell visible in this notebook.
labels = ["negative", "positive"]

# Vocabulary pruning thresholds, passed to TfidfVectorizer as min_df / max_df.
min_allowed_occurence = 10          # integer: minimum document count for a term
max_allowed_frequency_rate = 0.95   # float: maximum share of documents for a term

## Load and transform data

In [16]:
# Load data and draw a 20,000-row random subset.
# The sample is seeded: without random_state a different subset is drawn on
# every run, so the splits and every score below would not be reproducible
# even though train_test_split itself is seeded.
df = pd.read_csv('data.csv').sample(n=20000, random_state=42)

# 80/10/10 split: hold out 20% of the rows, then halve that hold-out into a
# validation set (model comparison) and a test set.
X_train, X_temp, y_train, y_temp = train_test_split(
    df['text'], df['sentiment'], test_size=0.2, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

# Transform the text into TF-IDF features. The vectorizer is fitted on the
# training split only, so the vocabulary and IDF weights never see the
# validation or test text; those splits are only transformed.
vec = TfidfVectorizer(min_df=min_allowed_occurence, max_df=max_allowed_frequency_rate)
X_train_features = vec.fit_transform(X_train)
X_val_features = vec.transform(X_val)
X_test_features = vec.transform(X_test)

## Train and compare classifiers (including a support vector classifier)

In [17]:
# Candidate classifiers to compare, as (display name, estimator) pairs.
# The order is significant: the save cell selects the support vector
# classifier by position (index 2), so new entries must not shift it.
# Estimators with a random component are seeded so that the scores are
# reproducible from run to run.
models = [
    ('LR ', LogisticRegression()),
    ('KNN', KNeighborsClassifier()),
    ('SVC', LinearSVC(random_state=42)),
    ('DT ', DecisionTreeClassifier(random_state=42)),
    ('RF ', RandomForestClassifier(random_state=42)),
    ('GB ', GradientBoostingClassifier(random_state=42)),
]

# Fit every model on the training features and record its accuracy on the
# held-out validation split (a single hold-out score, not cross-validation).
names = []
results = []
for name, model in models:
    model.fit(X_train_features, y_train)
    names.append(name)
    results.append(model.score(X_val_features, y_val))

In [18]:
# Print the validation score recorded for each model.
# Each entry of `results` is one scalar score, so it is printed directly;
# calling .mean() on it is redundant and raises AttributeError when the
# score is a plain Python float.
print("validation scores:")
for model_name, score in zip(names, results):
    print(model_name, score)

validation scores:
LR  0.759
KNN 0.577
SVC 0.756
DT  0.641
RF  0.7285
GB  0.6965


## Save models 

In [5]:
# Persist the selected classifier entry and the fitted vectorizer with pickle.
# NOTE(review): models[2] is the ('SVC', estimator) tuple, not the bare
# estimator, so model.sav contains a (name, estimator) pair. The pickled
# object is left unchanged here because the code that loads it is not
# visible; confirm whether the loader expects the tuple or the model itself.
SVC = models[2]

# Context managers guarantee the files are flushed and closed, even if
# pickling raises part-way through.
with open(destination + 'model.sav', 'wb') as model_file:
    pickle.dump(SVC, model_file)
with open(destination + 'vectorizer.pk', 'wb') as vectorizer_file:
    pickle.dump(vec, vectorizer_file)