In [26]:
import json

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline


In [5]:
# Locations of the train/test CSV splits, relative to the working directory.
train_path, test_path = 'dataset/train.csv', 'dataset/test.csv'

# Read each split into its own DataFrame.
train_data = pd.read_csv(filepath_or_buffer=train_path)
test_data = pd.read_csv(filepath_or_buffer=test_path)

In [27]:
# Seed for the stochastic estimator below, so that repeated runs of the
# notebook produce the same fitted forest and therefore the same metrics.
RANDOM_STATE = 42

# Text classifier: TF-IDF features fed into a multinomial naive Bayes model.
bayes_clf = Pipeline([
    ("tfidf", TfidfVectorizer(lowercase=True)),
    ("clf", MultinomialNB())
])

# Text classifier: TF-IDF features fed into a random forest.
# The forest is randomized, so it is seeded explicitly for reproducibility.
tree_clf = Pipeline([
    ("tfidf", TfidfVectorizer(lowercase=True)),
    ("clf", RandomForestClassifier(random_state=RANDOM_STATE))
])

In [20]:
# Features are the Description column; the target is a boolean Series that is
# True where Category equals 'Adult'.
X_train = train_data["Description"]
y_train = train_data["Category"] == 'Adult'

X_test = test_data["Description"]
y_test = test_data["Category"] == 'Adult'

In [31]:
def val_score(y, y_pred):
    """Score predictions against true labels.

    Args:
        y: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y``.

    Returns:
        A tuple ``(precision, recall, f1)`` holding the results of
        ``precision_score``, ``recall_score`` and ``f1_score``, each called
        with its default settings.
    """
    metric_functions = (precision_score, recall_score, f1_score)
    return tuple(metric(y, y_pred) for metric in metric_functions)

In [40]:
def fit_and_score(clf, X_train, y_train, X_test, y_test):
    """Fit a classifier on the training split and score it on the test split.

    Args:
        clf: Estimator exposing ``fit(X, y)`` and ``predict(X)``; it is fitted
            in place.
        X_train: Training inputs passed to ``clf.fit``.
        y_train: Training labels passed to ``clf.fit``.
        X_test: Inputs passed to ``clf.predict``.
        y_test: True labels the predictions are compared against.

    Returns:
        A dict with the keys ``"precision"``, ``"recall"`` and ``"f1"``,
        holding the values produced by ``val_score``.
    """
    clf.fit(X_train, y_train)
    predictions = clf.predict(X_test)
    precision, recall, f1 = val_score(y_test, predictions)
    return dict(precision=precision, recall=recall, f1=f1)

In [42]:
# Train and evaluate both pipelines on the same train/test split.
tree_metrics = fit_and_score(
    clf=tree_clf,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)
bayes_metrics = fit_and_score(
    clf=bayes_clf,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

In [43]:
# Gather the metric dicts of both models under one mapping, keyed by model name.
result = {
    'tree': tree_metrics,
    'bayes': bayes_metrics
}

# Write the metrics to result.json in the working directory. The encoding is
# given explicitly so the file does not depend on the platform default.
with open('result.json', 'w', encoding='utf-8') as f:
    json.dump(result, f, indent=4)