## Training a simple model

We've gathered a dataset, cleaned it, formatted it, and separated it into a train and test split. We can now train our first model.

In [8]:
# Load IPython's autoreload extension and set it to mode 2, which re-imports
# modules before each cell execution so edits to local code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report
import joblib

import sys
# Put the parent directory on the import path; the `ml_editor` imports in the
# following cells presumably rely on this when the notebook is run from its
# own folder.
sys.path.append('..')
# Seed NumPy's global random number generator.
np.random.seed(42)

import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including pandas / scikit-learn warnings that may point at real problems.
warnings.filterwarnings('ignore')

In [12]:
from ml_editor.data_processing import (
    format_raw_df,
    add_text_features_to_df,
    get_feature_vector_and_label,
    get_split_by_author,
    get_vectorized_inputs_and_label,
    get_vectorized_series,
    get_normalized_series,
    train_vectorizer
)

In [9]:
data_path = Path('../data/writers.csv')

# One chain: read the CSV, hand a copy to the project's formatter, keep only
# the rows selected by the 'is_question' column, and detach the result.
df = (
    pd.read_csv(data_path)
    .copy()
    .pipe(format_raw_df)
    .loc[lambda frame: frame['is_question']]
    .copy()
)

Add features, create a train / test split, and vectorize the text.

In [10]:
# Run the project's feature helper on a copy of the frame, then split the
# result with get_split_by_author (20% test share, fixed random_state).
df = df.copy().pipe(add_text_features_to_df)
train_df, test_df = get_split_by_author(df, random_state=42, test_size=0.2)

In [11]:
# Build the vectorizer with the project helper; only the training split is
# passed in, so the test split is not an input to this step.
vectorizer = train_vectorizer(train_df)

In [13]:
# Add a 'vectors' column to each split: a copy of the split's 'full_text'
# column is passed through get_vectorized_series together with the vectorizer
# created from train_df.
# NOTE(review): these are in-place column assignments on the frames returned
# by get_split_by_author; if those are slices of `df`, pandas may emit a
# SettingWithCopyWarning here (hidden by the global warnings filter) — confirm.
train_df['vectors'] = get_vectorized_series(train_df['full_text'].copy(), vectorizer)
test_df['vectors'] = get_vectorized_series(test_df['full_text'].copy(), vectorizer)

In [14]:
# Feature names handed to get_feature_vector_and_label for both splits.
features = ["action_verb_full", "question_mark_full", "text_len", "language_question"]

# Same helper, same feature list, applied to the train split first and the
# test split second; each call yields an (inputs, labels) pair.
(X_train, y_train), (X_test, y_test) = [
    get_feature_vector_and_label(split, features)
    for split in (train_df, test_df)
]

Once our features and labels are ready, training a model only requires a few lines using `sklearn`.

In [15]:
# Random forest with class-balanced weights. oob_score=True asks scikit-learn
# to keep out-of-bag estimates on the fitted model, which the cells below use
# as a validation-like signal on the training set.
# random_state pins the forest's own randomness so that refitting reproduces
# the same model regardless of which cells ran before this one or what else
# has consumed NumPy's global random state.
clf = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced',
    oob_score=True,
    random_state=42,
)
clf.fit(X_train, y_train)

# Hard class predictions and per-class probabilities for the test split.
y_predicted = clf.predict(X_test)
y_predicted_proba = clf.predict_proba(X_test)

In [16]:
# Count how many training examples carry each label value.
y_train.value_counts()

False    3327
True     2889
Name: Score, dtype: int64

### Metrics

In [17]:
def get_metrics(y_test, y_pred):
    """
    Summarise classification quality with four scores.

    Precision, recall and F1 are each computed with ``pos_label=None`` and
    ``average='weighted'``; accuracy is computed without extra options.

    :param y_test: ground-truth labels
    :param y_pred: predicted labels
    :return: tuple ``(accuracy, precision, recall, f1)``
    """
    weighted_options = {'pos_label': None, 'average': 'weighted'}

    # The three averaged scores share one set of options, so compute them in
    # a single pass (precision, then recall, then F1).
    precision, recall, f1 = [
        score_fn(y_test, y_pred, **weighted_options)
        for score_fn in (precision_score, recall_score, f1_score)
    ]

    return accuracy_score(y_test, y_pred), precision, recall, f1

In [18]:
# Out-of-bag predictions for the training set. Each row of
# oob_decision_function_ holds one score per class, so argmax yields a column
# index rather than a label. Translating that index through clf.classes_
# gives predictions in the same label space as y_train instead of relying on
# the labels happening to equal the column positions 0 and 1.
y_train_pred = clf.classes_[np.argmax(clf.oob_decision_function_, axis=1)]

In [20]:
from sklearn.metrics import classification_report

In [21]:
# Per-class report on the training labels, scored against the out-of-bag
# predictions held in y_train_pred.
print(classification_report(y_train, y_train_pred))

              precision    recall  f1-score   support

       False       0.61      0.70      0.65      3327
        True       0.58      0.48      0.52      2889

    accuracy                           0.59      6216
   macro avg       0.59      0.59      0.59      6216
weighted avg       0.59      0.59      0.59      6216



In [22]:
# Per-class report on the test labels, scored against the clf.predict output
# stored in y_predicted.
print(classification_report(y_test, y_predicted))

              precision    recall  f1-score   support

       False       0.62      0.63      0.63       937
        True       0.57      0.56      0.56       818

    accuracy                           0.60      1755
   macro avg       0.59      0.59      0.59      1755
weighted avg       0.60      0.60      0.60      1755



In [27]:
# Persist the fitted classifier and the vectorizer next to each other.
models_dir = Path('../models')
model_path = models_dir / 'model_1.pkl'
vectorizer_path = models_dir / 'vectorizer_1.pkl'

# joblib.dump does not create missing directories; make sure the target
# folder exists so this cell also works on a fresh checkout.
models_dir.mkdir(parents=True, exist_ok=True)

joblib.dump(clf, model_path)
joblib.dump(vectorizer, vectorizer_path)

['../models/vectorizer_1.pkl']

### Inference function

To apply the model to unseen data, we can define an inference function built on our trained model and call it on new text.

In [28]:
from ml_editor.model_v1 import get_model_probabilities_for_input_texts

../ml_editor


In [29]:
# Score a single example text with the imported inference helper.
test_q = ['bad question']
probs = get_model_probabilities_for_input_texts(test_q)

# probs[0] is the result for the only input text; index 1 is presumably the
# probability of the positive (high-score) class — confirm against
# ml_editor.model_v1.
print('{:.2f} probability of the question receiving a high score according to our model'.format(probs[0][1]))

0.24 probability of the question receiving a high score according to our model
