# Project 13: Text Analysis
---

## The Dataset

## Imports

In [9]:
import math

import numpy as np
import pandas as pd

import matplotlib
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns

from sklearn.dummy import DummyClassifier

from tqdm.auto import tqdm
import sklearn.metrics as metrics

%matplotlib inline
%config InlineBackend.figure_format = 'png'
# the next line provides graphs of better quality on HiDPI screens
%config InlineBackend.figure_format = 'retina'

plt.style.use('seaborn')

In [3]:
# this is to use progress_apply, read more at https://pypi.org/project/tqdm/#pandas-integration
tqdm.pandas()

## Load and Preprocess Data

In [4]:
data = pd.read_pickle('../data/model_data.pkl')

df = data[['clean_description', 'price_realised']]
print(df.head(2))
print(f'Num Datapoints = {len(df)}')

                                   clean_description  price_realised
0  \nAffixed to a stained mahogany stand, restore...          1875.0
1  \n10 in. (25.5 cm.) high, 15 in. (38 cm.) wide...          1625.0
Num Datapoints = 53903


## Evaluation Procedure

Composing an evaluation routine which can be used for all models in this project

In [8]:
def evaluate_model(model, train_features, train_target, test_features, test_target):
    
    eval_stats = {}
    
    fig, axs = plt.subplots(1, 3, figsize=(20, 6)) 
    
    for type, features, target in (('train', train_features, train_target), ('test', test_features, test_target)):
        
        eval_stats[type] = {}
    
        pred_target = model.predict(features)
        pred_proba = model.predict_proba(features)[:, 1]
        
        # F1
        f1_thresholds = np.arange(0, 1.01, 0.05)
        f1_scores = [metrics.f1_score(target, pred_proba>=threshold) for threshold in f1_thresholds]
        
        # ROC
        fpr, tpr, roc_thresholds = metrics.roc_curve(target, pred_proba)
        roc_auc = metrics.roc_auc_score(target, pred_proba)    
        eval_stats[type]['ROC AUC'] = roc_auc

        # PRC
        precision, recall, pr_thresholds = metrics.precision_recall_curve(target, pred_proba)
        aps = metrics.average_precision_score(target, pred_proba)
        eval_stats[type]['APS'] = aps
        
        if type == 'train':
            color = 'blue'
        else:
            color = 'green'

        # F1 Score
        ax = axs[0]
        max_f1_score_idx = np.argmax(f1_scores)
        ax.plot(f1_thresholds, f1_scores, color=color, label=f'{type}, max={f1_scores[max_f1_score_idx]:.2f} @ {f1_thresholds[max_f1_score_idx]:.2f}')
        # setting crosses for some thresholds
        for threshold in (0.2, 0.4, 0.5, 0.6, 0.8):
            closest_value_idx = np.argmin(np.abs(f1_thresholds-threshold))
            marker_color = 'orange' if threshold != 0.5 else 'red'
            ax.plot(f1_thresholds[closest_value_idx], f1_scores[closest_value_idx], color=marker_color, marker='X', markersize=7)
        ax.set_xlim([-0.02, 1.02])    
        ax.set_ylim([-0.02, 1.02])
        ax.set_xlabel('threshold')
        ax.set_ylabel('F1')
        ax.legend(loc='lower center')
        ax.set_title(f'F1 Score') 

        # ROC
        ax = axs[1]    
        ax.plot(fpr, tpr, color=color, label=f'{type}, ROC AUC={roc_auc:.2f}')
        # setting crosses for some thresholds
        for threshold in (0.2, 0.4, 0.5, 0.6, 0.8):
            closest_value_idx = np.argmin(np.abs(roc_thresholds-threshold))
            marker_color = 'orange' if threshold != 0.5 else 'red'            
            ax.plot(fpr[closest_value_idx], tpr[closest_value_idx], color=marker_color, marker='X', markersize=7)
        ax.plot([0, 1], [0, 1], color='grey', linestyle='--')
        ax.set_xlim([-0.02, 1.02])    
        ax.set_ylim([-0.02, 1.02])
        ax.set_xlabel('FPR')
        ax.set_ylabel('TPR')
        ax.legend(loc='lower center')        
        ax.set_title(f'ROC Curve')
        
        # PRC
        ax = axs[2]
        ax.plot(recall, precision, color=color, label=f'{type}, AP={aps:.2f}')
        # setting crosses for some thresholds
        for threshold in (0.2, 0.4, 0.5, 0.6, 0.8):
            closest_value_idx = np.argmin(np.abs(pr_thresholds-threshold))
            marker_color = 'orange' if threshold != 0.5 else 'red'
            ax.plot(recall[closest_value_idx], precision[closest_value_idx], color=marker_color, marker='X', markersize=7)
        ax.set_xlim([-0.02, 1.02])    
        ax.set_ylim([-0.02, 1.02])
        ax.set_xlabel('recall')
        ax.set_ylabel('precision')
        ax.legend(loc='lower center')
        ax.set_title(f'PRC')        

        eval_stats[type]['Accuracy'] = metrics.accuracy_score(target, pred_target)
        eval_stats[type]['F1'] = metrics.f1_score(target, pred_target)
    
    df_eval_stats = pd.DataFrame(eval_stats)
    df_eval_stats = df_eval_stats.round(2)
    df_eval_stats = df_eval_stats.reindex(index=('Accuracy', 'F1', 'APS', 'ROC AUC'))
    
    print(df_eval_stats)
    
    return

## Normalization

We assume all models below accepts texts in lowercase and without any digits, punctuations marks etc.

In [5]:
df['clean_description'] = df['clean_description'].str.lower()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


## Train / Test Split

Luckily, the whole dataset is already divided into train/test one parts. The corresponding flag is 'ds_part'.

In [7]:
dat = df.sample(frac=1)
train_idx = int(len(dat) * 0.8)
train = dat[:train_idx]
test = dat[train_idx:]

train_text = train['clean_description']
test_text = test['clean_description']

train_targ = train['price_realised']
test_targ = test['price_realised']

print(train_text.shape)
print(test_text.shape)
print(train_targ.shape)
print(test_targ.shape)

(43122,)
(10781,)
(43122,)
(10781,)


## Model 0 - Constant

In [27]:
dum = DummyClassifier(random_state=1)
dum.fit(train_text, train_targ)
dum_pred = dum.predict(test_text)
print(metrics.mean_absolute_error(test_targ, dum_pred))

354466.1169047399


## Model 1 - NLTK, TF-IDF and LR

### TF-IDF

In [33]:
import nltk

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
import xgboost as xgb

from nltk.corpus import stopwords

In [31]:
def tfidf_preprocessing(train_feature_text, test_feature_text):
    stop_words = set(stopwords.words('english'))
    vec = TfidfVectorizer(stop_words=stop_words)

    train_tfidf = vec.fit_transform(train_feature_text)
    test_tfidf = vec.transform(test_feature_text)
    
    return train_tfidf, test_tfidf

train_tfidf, test_tfidf = tfidf_preprocessing(train_text, test_text)
smol_tr = train_tfidf[:10000]
smol_ts = test_tfidf[:2000]
y1_smol = train_targ[:10000]
y2_smol = test_targ[:2000]

print(train_tfidf.shape)
print(test_tfidf.shape)

(43122, 47384)
(10781, 47384)


In [29]:
dt = DecisionTreeRegressor().fit(smol_tr, y1_smol)
dt_p = dt.predict(smol_ts)
print(metrics.mean_absolute_error(y2_smol, dt_p))

dt1 = DecisionTreeRegressor().fit(train_tfidf, train_targ)
dt1_p = dt1.predict(test_tfidf)
print(metrics.mean_absolute_error(test_targ, dt1_p))

509395.5036959888
499203.95610983414


In [32]:
rf = RandomForestRegressor(random_state=1)
rf.fit(smol_tr, y1_smol)
rf_pred = rf.predict(smol_ts)
print(metrics.mean_absolute_error(y2_smol, rf_pred))

450507.7198169687


In [56]:
pickle.dump(rf, open('rf_tfidf.pkl', 'wb'))

In [36]:
# xgb
xg = xgb.XGBRegressor().fit(train_tfidf, train_targ)
xgp = xg.predict(test_tfidf)
print(metrics.mean_absolute_error(test_targ, xgp))

499667.5569045351


In [None]:
# save fitted model with pickle


## Model 2 - spaCy, TF-IDF and LR

In [39]:
import spacy

nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

In [40]:
train_text = train_text.reset_index(drop=True)
train_text

0        \ncandy bars from candy counter in the store \...
1                                                         
2                                                         
3        \ncane allo specchio (dog in the mirror) \npla...
4        \nan american ship in hong kong harbor \noil o...
                               ...                        
43117    \n\n2 volumes in-12 (176 x 103 mm). frontispic...
43118    \nolive trees \nsigned and dated 'john aldridg...
43119    \nle cirque à l'étoile (m. 436) \nlithograph i...
43120    \nmost 20th century, \nthree cups, <i>mark of ...
43121    \n(indonesian, 1907-1990)\nbarong\nsigned with...
Name: clean_description, Length: 43122, dtype: object

In [41]:
train_text_list = list(train_text)

In [42]:
def spacy_lemma(series_text):
    
    new_series = []
    for row in list(series_text.index):
        doc = nlp(series_text.loc[row])
        #tokens = [token.lemma_ for token in doc if not token.is_stop]
        tokens = [token.lemma_ for token in doc]
    
        text = ' '.join(tokens)
        new_series.append(text)
    return pd.Series(new_series)

train_text_s = spacy_lemma(train_text)
test_text_s = spacy_lemma(test_text)

print('after spacy')
print(train_text_s.shape)
print(test_text_s.shape)

train_text_ts, test_text_ts = tfidf_preprocessing(train_text_s, test_text_s)

print('---')
print('after tfidf')
print(train_text_ts.shape)
print(test_text_ts.shape)

after spacy
(43122,)
(10781,)
---
after tfidf
(43122, 42807)
(10781, 42807)


In [43]:
xg = xgb.XGBRegressor().fit(train_text_ts, train_targ)
xgp = xg.predict(test_text_ts)
print(metrics.mean_absolute_error(test_targ, xgp))

498811.85662448563


## Model 3 - spaCy, TF-IDF and LGBMClassifier

In [49]:
from sklearn.model_selection import train_test_split

# splitting the train set for 20% validation to tune parameters max_depth & num_leaves

train_ts, val_ts, ytrain, yval = train_test_split(train_text_ts, train_targ, test_size=0.2, random_state=1)

print(train_ts.shape)
print(val_ts.shape)
print(ytrain.shape)
print(yval.shape)

(34497, 42807)
(8625, 42807)
(34497,)
(8625,)


In [52]:
lgbm = xgb.XGBRegressor(max_depth=35, random_state=1)
lgbm.fit(train_text_ts, train_targ)

XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
             importance_type=None, interaction_constraints='',
             learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
             max_delta_step=0, max_depth=35, max_leaves=0, min_child_weight=1,
             missing=nan, monotone_constraints='()', n_estimators=100, n_jobs=0,
             num_parallel_tree=1, predictor='auto', random_state=1, reg_alpha=0,
             reg_lambda=1, ...)

In [53]:
import pickle

In [55]:
pickle.dump(lgbm, open('xgboost_tfidf.pkl', 'wb'))