### 1. Loading data

In [9]:
import json
from collections import Counter

import numpy as np
import pandas as pd
import requests
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer

In [5]:
def load_zenodo_open_metadata(filename):
    """
    Loading Zenodo open metadata from:
    - zenodo archive in json format (list of json records)
    - zenodo archive in jsonl format
    - zenodo dump in jsonl format
    """
    if filename.endswith(".json"):
        with open(filename, "r") as data_file:
            json_data = data_file.read()
        data = json.loads(json_data)
        return data
    elif filename.endswith(".txt"):
        with open(filename, "r") as fp:
            data = [json.loads(l) for l in fp.readlines()]
        return data
    else:
        print("Unsupported archive format")


FOLDER = "datasets"
FILENAMES = [
    "zenodo_open_metadata_06_03_2017.json",
    "zenodo_open_metadata_06_04_2017.json",
    "zenodo_open_metadata_17_05_2018.txt",
    "zenodo_open_metadata_2019-11-04T12:21:26.052379.txt",
]
FILENAMES = ["./{}/{}".format(FOLDER, filename) for filename in FILENAMES]

data_raw = []
for filename in FILENAMES:
    data_raw.append(load_zenodo_open_metadata(filename))
    print("Data from {} has been loaded".format(filename))

Data from ./datasets/zenodo_open_metadata_06_03_2017.json has been loaded
Data from ./datasets/zenodo_open_metadata_06_04_2017.json has been loaded
Data from ./datasets/zenodo_open_metadata_17_05_2018.txt has been loaded
Data from ./datasets/zenodo_open_metadata_2019-11-04T12:21:26.052379.txt has been loaded


### 3. Applying model

In [12]:
pd_data = pd.DataFrame(data_raw[3])
y = pd_data.spam
X = pd_data[["description", "title"]]
X["full"] = X["description"] + X["title"]
feature = "full"

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.33, random_state=422)

text_clf = Pipeline(
    [
        ("tfidf", TfidfVectorizer(max_features=8000, ngram_range=(1, 1))),
        ("clf", RandomForestClassifier(n_estimators=10)),
    ]
)

text_clf = text_clf.fit(X_train[feature], y_train)
y_pred = text_clf.predict(X_valid[feature])
acc = [(ref, pred) for ref, pred in zip(y_valid, y_pred)]
c = Counter(acc)
print(c)
print("Spam->Spam: {0:.4f}".format(c[(True, True)] / (c[(True, True)] + c[(True, False)])))
print("Ham -> Ham: {0:.4f}".format(c[(False, False)] / (c[(False, False)] + c[(False, True)])))
print("Accuracy: {0:.4f}".format((c[(False, False)] + c[(True, True)] ) / (len(acc))))

# IMPROVE METRICS



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Counter({(False, False): 483307, (True, True): 3580, (True, False): 461, (False, True): 43})
Spam->Spam: 0.8859
Ham -> Ham: 0.9999
Accuracy: 0.9990


In [13]:
from sklearn.externals import joblib
joblib.dump(text_clf, '2019_11_04_reports_spam.pkl') 





['2019_11_04_reports_spam.pkl']

##### Results

###### Accuracy of model for data up to 06/03/2017
Counter({(False, False): 56495, (True, True): 1652, (False, True): 140, (True, False): 38})<br/>
Spam->Spam: 0.9775<br/>
Ham -> Ham: 0.9975<br/>
Accuracy: 0.9969<br/>
\----
###### Accuracy of model for data up to 06/04/2017
Counter({(False, False): 58746, (True, True): 1855, (True, False): 65, (False, True): 54})<br/>
Spam->Spam: 0.9661<br/>
Ham -> Ham: 0.9991<br/>
Accuracy: 0.9980<br/>
\----
###### Accuracy of model for data up to 17/05/2018
Counter({(False, False): 133426, (True, True): 2627, (True, False): 149, (False, True): 128})<br/>
Spam->Spam: 0.9463<br/>
Ham -> Ham: 0.9990<br/>
Accuracy: 0.9980<br/>
\----
###### Accuracy of model for data up to 04/11/2019
Counter({(False, False): 482380, (True, True): 3526, (False, True): 970, (True, False): 515})<br/>
Spam->Spam: 0.8726<br/>
Ham -> Ham: 0.9980<br/>
Accuracy: 0.9970<br/>
\----
###### Accuracy of model for data up to 04/11/2019 with RandomForest Model
Counter({(False, False): 483307, (True, True): 3580, (True, False): 461, (False, True): 43})<br/>
Spam->Spam: 0.8859<br/>
Ham -> Ham: 0.9999<br/>
Accuracy: 0.9990<br/>

In [19]:
import plotly.graph_objects as go

class CurrentModel:
    
    def __init__(self, acc, spam_spam, ham_ham, name):
        self.acc = acc
        self.spam_spam = spam_spam
        self.ham_ham = ham_ham
        self.name = name
        
    def plot(self):
        x = ['Acc', 'Spam', 'Ham']
        y = [self.acc, self.spam_spam, self.ham_ham]
        fig = go.Figure()
        fig.add_trace(go.Histogram(
            x=x,
            y=y,
            name=name
        ))
        fig.show()

def plot_all(models):
    x = ['Acc', 'Spam', 'Ham']
    fig = go.Figure()
    for model in models:
        fig.add_trace(go.Bar(
            x=x,
            y=[model.acc, model.spam_spam, model.ham_ham],
            name=model.name
        ))
    fig.update_yaxes(range=[0.85, 1])
    fig.show()
    
model_06_03_2017 = CurrentModel(0.9969, 0.9975, 0.9991, 'data up to 06/03/2017')
model_06_04_2017 = CurrentModel(0.9980, 0.9661, 0.9975, 'data up to 06/04/2017')
model_17_05_2018 = CurrentModel(0.9980, 0.9463, 0.9990, 'data up to 17/05/2018')
model_04_11_2019 = CurrentModel(0.9990, 0.8726, 0.9980, 'data up to 04/11/2019')
model_04_11_2019_RandomForest = CurrentModel(0.9990, 0.8859, 0.9999, 'data up to 04/11/2019 <RandomForest>')

models = [
    model_06_03_2017,
    model_06_04_2017,
    model_17_05_2018,
    model_04_11_2019,
    model_04_11_2019_RandomForest
]

plot_all(models)