In [1]:
import os
import pickle
from io import StringIO

from sklearn.metrics import f1_score
from sklearn.pipeline import Pipeline

import pandas as pd
from sklearn import model_selection

# Load the labelled corpus and hold out a test split. The fixed random_state
# makes the split identical on every run.
df = pd.read_parquet("data/training.parquet")
train, test = model_selection.train_test_split(df, random_state=43)

# X_train / X_test are pandas Series taken from the "text" column (one
# document per entry); y_train / y_test are the matching "label" Series.
X_train, y_train = train["text"], train["label"]
X_test, y_test = test["text"], test["label"]
# NOTE(review): the comment that used to sit here said "this calculates a
# vector of term frequencies for each document" -- presumably that describes
# the pickled feature pipeline loaded below; confirm against the code that
# produced feature_pipeline.sav.

# SECURITY: pickle.load can execute arbitrary code while unpickling. Only load
# .sav files that you created yourself or otherwise trust.
import pickle, os

## loading in feature vectors pipeline
filename = 'feature_pipeline.sav'
# The context manager closes the file handle as soon as loading finishes
# (the bare open() used previously was never closed).
with open(filename, 'rb') as feature_file:
    feat_pipeline = pickle.load(feature_file)

## loading model
filename = 'model.sav'
with open(filename, 'rb') as model_file:
    model = pickle.load(model_file)



# Chain the loaded feature extractor and the loaded classifier so that raw
# documents can be handed straight to predict().
steps = [
    ('features', feat_pipeline),
    ('model', model),
]
pipeline = Pipeline(steps)

# fit() is intentionally not called: both steps come from the pickles loaded
# above. To retrain from scratch instead, run pipeline.fit(X_train, y_train).

# Label every held-out document.
y_preds = pipeline.predict(X_test)
print(y_preds)

# Micro-averaged F1 of the predictions against the true test labels.
mean_f1 = f1_score(y_true=y_test, y_pred=y_preds, average='micro')
print(mean_f1)

['legitimate' 'spam' 'legitimate' ... 'spam' 'legitimate' 'legitimate']
0.9629


In [2]:
# Two ad-hoc example documents in "records" form: one object per row, each
# with a single "text" field. The JSON text is wrapped in StringIO because
# passing a literal JSON string directly to pd.read_json is deprecated.
ddf = pd.read_json(StringIO("""[{"text" : "It is a truth universally acknowledged"}, 
{"text" : "I have never seen a more disgusting dog food"}]"""), orient="records")

In [6]:
# Pass the "text" column explicitly. squeeze() only yields a Series while the
# frame has more than one row; with a single row it collapses to a bare
# string, so selecting the column by name is the shape-independent form.
pipeline.predict(ddf["text"])

array(['legitimate', 'spam'], dtype=object)

In [4]:
# Sanity check: which container type does the pipeline receive as test input?
type(X_test.sample(n=10))

pandas.core.series.Series

In [54]:
# A bare JSON array of strings: read_json turns it into a one-column frame
# (column label 0) with one row per document. The JSON text is wrapped in
# StringIO because passing a literal JSON string to pd.read_json is deprecated.
ddf2 = pd.read_json(StringIO("""["dog food", "Well, she went on to say something sensible, but knew not what answer she returned to the Park, and Elinor was not blinded by the beauty, or the shrewd look of the youngest, to her want of sense."]"""), orient="records")
ddf2

Unnamed: 0,0
0,dog food
1,"Well, she went on to say something sensible, b..."


In [55]:
# Turn the one-column frame into a 1-D Series of documents for the pipeline.
# iloc[:, 0] always returns a Series, whatever the row count. The previous
# squeeze()-based check skipped single-row frames and handed the whole
# DataFrame to predict() in that case.
if ddf2.shape[1] != 1:
    # NOTE(review): elsewhere in this notebook the pipeline is only fed a
    # single column of text, so anything else is treated as a caller error.
    raise ValueError(
        f"expected exactly one column of documents, got {ddf2.shape[1]}"
    )
pred = ddf2.iloc[:, 0]
pipeline.predict(pred)

array(['spam', 'legitimate'], dtype=object)

In [74]:
# Single-document variant of the JSON-array input (one row, one column).
# The JSON text is wrapped in StringIO because passing a literal JSON string
# to pd.read_json is deprecated.
ddf3 = pd.read_json(StringIO("""["It is a truth universally acknowledged"]"""), orient="records")
ddf3

Unnamed: 0,0
0,It is a truth universally acknowledged


In [75]:
# Turn the one-column frame into a 1-D Series of documents for the pipeline.
# iloc[:, 0] returns a Series even for a single row, so there is no need to
# special-case the 1x1 frame (where squeeze() collapses to a bare string that
# then had to be re-wrapped in a list).
if ddf3.shape[1] != 1:
    # NOTE(review): elsewhere in this notebook the pipeline is only fed a
    # single column of text, so anything else is treated as a caller error.
    raise ValueError(
        f"expected exactly one column of documents, got {ddf3.shape[1]}"
    )
pred = ddf3.iloc[:, 0]
pipeline.predict(pred)

array(['legitimate'], dtype=object)

In [73]:
# Draw two random documents and serialise them as a JSON array of strings,
# i.e. the same payload shape that the read_json cells above consume.
df.loc[:, "text"].sample(n=2).to_json(orient="records")

'["Mr. Dashwood\'s disappointment was, at first, severe; but his temper was not his great perfection; and, indeed, with such a woman therefore there would be no difference of sentiment. It is a lovely night, and they are very pleasing women when you converse with them. His mother and sisters were spared much solicitude on her account. Mr. Tilney drank tea with us, and I have something of consequence to anybody.","Do not hesitate to eat them for breakfast you\'d probably want it sweeter. Highly recommended, the price is unbeatable for the quality. I commonly order jerky from all over the world! By the time I got the banana nut bread bar was absolutely delicious. This is really a great product. PLEASE start selling again!!! Okay another reviewer pointed out, though it is clearly labeled berry and they swiped their sticker that says Back Mountain Gift Baskets. But worth every penny."]'

In [68]:
# predict() requires the documents to classify; calling it with no argument
# raises TypeError on a fresh run.
# NOTE(review): the argument originally used in this cell is not recoverable
# from the notebook. A one-document list is supplied here; this same text is
# classified as 'legitimate' in the ddf3 cell.
pipeline.predict(["It is a truth universally acknowledged"])

array(['legitimate'], dtype=object)