In [52]:
import os
import joblib
import re
import string
import numpy as np
import pandas as pd

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

import random

from xgboost import XGBClassifier

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score

[nltk_data] Downloading package stopwords to /home/egor/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [53]:
# Field separator passed to every read_csv / to_csv call in this notebook.
SEP = "\t"
# Seed for the global RNGs and for the train/test split.
SEED = 42

In [54]:
# Seed Python's and NumPy's global random generators with SEED.
random.seed(SEED)
np.random.seed(SEED)

In [80]:
# Integer class id assigned to each target label.
label_to_num = dict(ham=0, spam=1)

In [84]:
# Reverse lookup: integer class id back to its label.
num_to_label = dict(zip(label_to_num.values(), label_to_num.keys()))

In [55]:
def make_initial_train_test_split(file_path):
    """
    Split the dataset at `file_path` into train and test parts and save both.

    The file is read as SEP-delimited text and split with scikit-learn's
    `train_test_split` (default proportions, `random_state=SEED`). The two
    parts are written as `train_data.tsv` and `test_data.tsv` into the folder
    that contains the input file, with `index=None` (no index column).
    """
    full_frame = pd.read_csv(file_path, sep=SEP)
    parts = train_test_split(full_frame, random_state=SEED)

    out_dir = os.path.dirname(file_path)
    # train_test_split returns (train, test), matching the order of the names.
    for part, out_name in zip(parts, ("train_data.tsv", "test_data.tsv")):
        part.to_csv(os.path.join(out_dir, out_name), sep=SEP, index=None)

In [56]:
# Write ../data/train_data.tsv and ../data/test_data.tsv from the full dataset.
make_initial_train_test_split("../data/data.tsv")

In [63]:
def clean_text(text):
    """
    Normalize a raw message before tokenization.

    The input is converted to `str` and lower-cased, then the following are
    deleted (replaced by the empty string, not by a space), in this order:

    1. square-bracketed spans such as ``[image]``;
    2. URLs starting with ``http://``, ``https://`` or ``www.``;
    3. angle-bracketed spans such as HTML tags;
    4. every character in ``string.punctuation``;
    5. newline characters;
    6. any word that contains at least one digit.

    Returns the cleaned string. Whitespace around deleted pieces is kept, so
    the result may contain consecutive spaces.
    """
    # Raw string literals keep the regex classes (\[, \S, \w, \d) out of
    # Python's own escape processing, which otherwise warns about invalid
    # escape sequences.
    text = str(text).lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

In [65]:
def preprocess_msg(msg, stop_words, stemmer):
    """
    Clean a single message, drop stop words and stem the remaining words.

    Parameters
    ----------
    msg : object
        Raw message; it is passed to `clean_text`, which converts it to `str`.
    stop_words : container of str
        Words to discard. Only membership (`in`) is used, so a `set` gives the
        fastest lookups.
    stemmer : object
        Anything exposing a `stem(word)` method that returns a string.

    Returns
    -------
    str
        The stemmed words that survived filtering, joined by single spaces.
    """
    # split() with no argument splits on any run of whitespace. clean_text
    # deletes tokens without closing the gap, so splitting on ' ' alone would
    # emit empty "words" and would leave tab-separated words glued together,
    # letting stop words slip past the filter.
    words = clean_text(msg).split()
    return ' '.join(
        stemmer.stem(word) for word in words if word not in stop_words
    )

In [87]:
def preprocess_data(file_path):
    """
    Load a SEP-delimited dataset and turn it into model-ready columns.

    The file must provide a `message` column (raw text) and a `target` column
    whose values are keys of `label_to_num`. Every message is run through
    `preprocess_msg` with NLTK's English stop words and the English Snowball
    stemmer; every label is replaced by its integer id.

    Parameters
    ----------
    file_path : str
        Path of the file to read.

    Returns
    -------
    tuple of (pandas.Series, pandas.Series)
        The preprocessed messages and the numeric targets.

    Raises
    ------
    KeyError
        If a required column is missing or a label is not in `label_to_num`.
    """
    df = pd.read_csv(file_path, sep=SEP)

    # A set makes the per-word stop-word test O(1); the list returned by NLTK
    # would be scanned linearly for every word of every message.
    stop_words = set(stopwords.words('english'))
    stemmer = nltk.SnowballStemmer("english")

    df['message'] = df['message'].apply(
        preprocess_msg,
        stop_words=stop_words,
        stemmer=stemmer
    )
    # Index the dict directly so an unexpected label fails loudly instead of
    # silently turning into a missing value.
    df['target'] = df['target'].apply(lambda label: label_to_num[label])

    return df["message"], df["target"]

In [None]:
def train_xgb_model(X, y, path_to_save):
    """
    Fit the bag-of-words -> TF-IDF -> XGBoost pipeline and persist it.

    Parameters
    ----------
    X : iterable of str
        Preprocessed messages.
    y : array-like
        Numeric targets aligned with `X`.
    path_to_save : str
        Destination passed to `joblib.dump` for the fitted pipeline.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The fitted pipeline (the same object that was written to disk).
    """
    model = Pipeline([
        ('bow', CountVectorizer()),
        ('tfid', TfidfTransformer()),
        # XGBClassifier is imported by name at the top of the notebook; no
        # `xgb` module alias is in scope.
        ('model', XGBClassifier(
            learning_rate=0.1,
            max_depth=7,
            n_estimators=100,
            use_label_encoder=False,
            eval_metric='auc',
        ))
    ])

    # Train on the arguments, not on the notebook-level X_train / y_train.
    model.fit(X, y)
    joblib.dump(model, path_to_save)
    return model

In [None]:
def run_model(model, X, y=None):
    """
    Predict with `model` and, when labels are supplied, score the predictions.

    Parameters
    ----------
    model : object
        Anything exposing a `predict(X)` method.
    X : object
        Inputs forwarded unchanged to `model.predict`.
    y : array-like, optional
        True labels. When omitted, no metric is computed.

    Returns
    -------
    The predictions alone when `y` is None; otherwise a tuple
    `(predictions, {"f1_score": value})`.
    """
    predictions = model.predict(X)
    if y is None:
        return predictions

    metrics = {"f1_score": f1_score(y, predictions)}
    return predictions, metrics

In [90]:
# Build the modelling inputs in this cell so it does not rely on variables
# created by a cell further down the notebook.
X_train, y_train = preprocess_data("../data/train_data.tsv")
X_test, y_test = preprocess_data("../data/test_data.tsv")

# Bag-of-words counts -> TF-IDF weighting -> gradient-boosted classifier.
pipe = Pipeline([
    ('bow', CountVectorizer()),
    ('tfid', TfidfTransformer()),
    # Use the XGBClassifier name imported at the top; `xgb` is never defined.
    ('model', XGBClassifier(
        learning_rate=0.1,
        max_depth=7,
        n_estimators=100,
        use_label_encoder=False,
        eval_metric='auc',
    ))
])

pipe.fit(X_train, y_train)

y_pred_class = pipe.predict(X_test)
y_pred_train = pipe.predict(X_train)

# Report F1 on both splits to expose the train/test gap.
print('Train: {}'.format(f1_score(y_train, y_pred_train)))
print('Test: {}'.format(f1_score(y_test, y_pred_class)))

Train: 0.9358851674641148
Test: 0.8688046647230321


In [88]:
# Preprocess the two splits stored under ../data.
# NOTE(review): `pipe.fit(X_train, y_train)` sits in an earlier cell of the
# file than this one; on a top-to-bottom run these variables must exist before
# they are first read — confirm the intended cell order.
X_train, y_train = preprocess_data("../data/train_data.tsv")
X_test, y_test = preprocess_data("../data/test_data.tsv")

In [30]:
# Show the pipeline's repr as the cell output.
pipe

In [31]:
import joblib

In [35]:
# Persist the pipeline to pipe.joblib, resolved against the current working directory.
joblib.dump(pipe, "pipe.joblib")

Train: 0.9358851674641148
Test: 0.8688046647230321


In [None]:
import joblib

In [None]:
# NOTE(review): the loaded object is not assigned to a name, and this path
# differs from the "pipe.joblib" file written by joblib.dump earlier — confirm
# which artifact is meant. joblib files are pickle-based: load trusted files only.
joblib.load("model.joblib")