In [1]:
from sklearn.feature_extraction.text import HashingVectorizer,TfidfTransformer
from sklearn.metrics import f1_score
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
import pickle, os

# X_train and X_test are pandas Series of strings (the "text" column),
# each entry representing one document.
# y_train and y_test are the matching Series of labels (the "label" column).

import pandas as pd
from sklearn import model_selection

# Read the labelled corpus; the code below relies on it having a "text"
# column (documents) and a "label" column (targets).
df = pd.read_parquet("data/training.parquet")

# Hold out a test partition using scikit-learn's default split proportions;
# the fixed random_state keeps the split reproducible between runs.
train, test = model_selection.train_test_split(df, random_state=43)

# Separate features from targets for each partition.
X_train, y_train = train["text"], train["label"]
X_test, y_test = test["text"], test["label"]
# Maps each document to a sparse feature vector by hashing its tokens.
# Hashing is stateless: no vocabulary is learned, so this step has nothing
# to fit.
vect = HashingVectorizer()

# Re-weights the hashed term features by inverse document frequency, which
# is learned from the training documents when the pipeline is fitted.
tfidf = TfidfTransformer()


## loading model
# SECURITY: unpickling executes arbitrary code embedded in the file, so only
# load model files that come from a trusted source.
filename = 'models/lr_model_tfidfsummaries.sav'
# Use a context manager so the file handle is closed deterministically
# instead of being left for the garbage collector.
with open(filename, 'rb') as model_file:
    # Presumably a logistic-regression estimator, judging by the file and
    # variable names -- confirm against whatever wrote the .sav file.
    logreg = pickle.load(model_file)

# Chain the steps so raw text goes in and class predictions come out.
pipeline = Pipeline([
    ('vect', vect),
    ('tfidf', tfidf),
    ('logreg', logreg),
])

# Fit every pipeline step on the training documents, then label the held-out
# documents with the fitted pipeline (fit returns the pipeline itself, so the
# two calls can be chained).
# NOTE(review): fitting retrains the final estimator that was loaded from
# disk; unless it was configured for warm starts, its previously learned
# parameters are presumably replaced -- confirm this is intended.
y_preds = pipeline.fit(X_train, y_train).predict(X_test)
print(y_preds)

# Score the predictions with micro-averaged F1, which pools the counts over
# all classes before computing the score.
# NOTE(review): for single-label data this should equal plain accuracy.
mean_f1 = f1_score(y_true=y_test, y_pred=y_preds, average='micro')
print(mean_f1)

['legitimate' 'spam' 'legitimate' ... 'spam' 'legitimate' 'legitimate']
0.9754
