In [1]:
# Imports
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score, train_test_split, RandomizedSearchCV, ShuffleSplit
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.metrics import r2_score
from collections import defaultdict

# My scripts 
import nltk_helper as nh

%matplotlib inline

# The 'seaborn' style sheet was renamed to 'seaborn-v0_8' in matplotlib 3.6 and the
# old name was later removed; prefer the new name and fall back on older installs.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

In [2]:
# Load the pickled dataset.
# NOTE(review): unpickling can execute arbitrary code -- only load files you trust.
data_path = '../data/data.pkl'
df = pd.read_pickle(data_path)

In [3]:
# Columns removed from the feature matrix (this includes both rating columns,
# which are used as targets further down).
drop_col = [
    'tags',
    'date_last_updated',
    'rating',
    'rating_gte_4',
    'warning',
]

# Free-text columns that get vectorized: the summary plus the ch1..ch5 columns.
text_col = ['summary'] + ['ch1', 'ch2', 'ch3', 'ch4', 'ch5']

In [4]:
# Separate the features from the two target columns
X = df.drop(columns = drop_col)
y = df.loc[:, ['rating_gte_4', 'rating']]

In [6]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.2,
    random_state = 0,
)

In [7]:
# Keep a handle on the title column of each split
X_train_titles, X_test_titles = X_train['title'], X_test['title']

# The string below is disabled code (dropping `title` in place); it is kept as-is
# because, as the last expression of the cell, it is what the cell displays.
'''
X_train.drop('title', axis=1, inplace=True)
X_test.drop('title', axis=1, inplace=True)
'''

"\nX_train.drop('title', axis=1, inplace=True)\nX_test.drop('title', axis=1, inplace=True)\n"

In [8]:
# Sanity check: report (rows, columns) of the train and test feature frames
print(f'Train Shape: {X_train.shape}   Test Shape: {X_test.shape}')

Train Shape: (2053, 95)   Test Shape: (514, 95)


---
---
**Convert text columns into NMF topic features**

In [10]:
# One TF-IDF vectorizer per text column, each fitted on the training split only.
# `fit` returns the vectorizer itself, so the fitted object is what gets stored.
tfid = {
    name: TfidfVectorizer(tokenizer = nh.prep_text).fit(X_train[name])
    for name in text_col
}



In [29]:
# Fit NMF 
def fit_nmf(data, tfid, n = 10, max_i = 500, random_state = 0):
    """Fit an NMF topic model on one text column.

    Parameters
    ----------
    data : pd.Series
        Text column. Its ``name`` prefixes the topic column names and its index
        labels the rows of the returned document-topic frame.
    tfid : fitted vectorizer
        Must expose ``transform`` and ``get_feature_names_out`` (or the older
        ``get_feature_names``).
    n : int, default 10
        Number of NMF components (topics).
    max_i : int, default 500
        Maximum number of NMF iterations.
    random_state : int or None, default 0
        Seed forwarded to ``NMF`` so repeated runs are reproducible.

    Returns
    -------
    nmf : NMF
        The fitted model.
    df_W : pd.DataFrame
        Document-topic weights, indexed like ``data``, with columns
        ``'<data.name>_topic_<i>'``.
    df_H : pd.DataFrame
        Topic-term weights (``nmf.components_``), one column per vocabulary term.

    Raises
    ------
    TypeError
        If ``n`` is not an integer (e.g. a Series passed positionally by mistake).
    """
    if not isinstance(n, (int, np.integer)):
        raise TypeError(f'n must be an integer number of topics, got {type(n).__name__}')

    # Vectorize the text with the already-fitted vectorizer
    X = tfid.transform(data)

    nmf = NMF(n_components = n, max_iter = max_i, random_state = random_state)
    nmf.fit(X)

    # Weights come from `transform` (not `fit_transform`) so they are computed
    # the same way as for any data projected onto this model later on.
    W = nmf.transform(X)
    H = nmf.components_

    # `get_feature_names` was removed from recent scikit-learn releases; prefer
    # the replacement and fall back to the old name on older installs.
    get_names = getattr(tfid, 'get_feature_names_out', None) or tfid.get_feature_names
    df_H = pd.DataFrame(H, columns = get_names())

    # Topic columns are named after the source column
    column_names = [f'{data.name}_topic_{i}' for i in range(n)]
    df_W = pd.DataFrame(W, index = data.index, columns = column_names)

    return nmf, df_W, df_H

In [33]:
# Dry run on the summary column: check that the topic frame lines up with X_train
# and that merging on the index only adds columns (X_train itself is not modified).
print(X_train.shape)
# Only (data, vectorizer) are passed: the third positional parameter of fit_nmf is
# the number of topics, so the titles Series must not be passed there.
_nmf, W, H = fit_nmf(X_train['summary'], tfid['summary'])
print(W.shape)
print(X_train.merge(W, left_index=True, right_index=True).shape)
print(X_train.shape)

(2053, 95)
(2053, 10)
(2053, 105)
(2053, 95)


In [31]:
# Row labels of the topic-weight frame; these are what the index-based merge joins on
print(W.index)

Int64Index([1267,    1,  384, 2220,  453, 1108,  989, 1732,  713, 1129,
            ...
             705, 2362, 1828, 1778,  277, 1033, 1731,  763,  835, 1653],
           dtype='int64', length=2053)


In [32]:
# Row labels of the training features, for comparison with the topic-weight frame's index
print(X_train.index)

Int64Index([1267,    1,  384, 2220,  453, 1108,  989, 1732,  713, 1129,
            ...
             705, 2362, 1828, 1778,  277, 1033, 1731,  763,  835, 1653],
           dtype='int64', length=2053)


In [None]:
# Add NMF topic weights for every text column to both splits.
for col in text_col:
    # Fit the topic model on the training split only. Only (data, vectorizer) are
    # passed: fit_nmf's third positional parameter is the topic count, not titles.
    nmf, W_train, _H_train = fit_nmf(X_train[col], tfid[col])

    # Dropping any previous copy of these columns first keeps the cell re-runnable
    # (otherwise a second run would produce suffixed duplicate columns).
    X_train = (X_train.drop(columns = W_train.columns, errors = 'ignore')
                      .merge(W_train, left_index = True, right_index = True))

    # Project the test split onto the topics learned from train. Fitting a second
    # NMF on the test data would yield topics unrelated to the training ones, so
    # equally named columns would not mean the same thing in the two splits.
    W_test = pd.DataFrame(nmf.transform(tfid[col].transform(X_test[col])),
                          index = X_test.index,
                          columns = W_train.columns)
    X_test = (X_test.drop(columns = W_test.columns, errors = 'ignore')
                    .merge(W_test, left_index = True, right_index = True))


In [None]:
# Preview the first rows only instead of rendering the whole frame
X_test.head()

In [None]:
# Three-topic models per text column.
# fit_nmf takes (data, vectorizer, n_topics) and returns
# (model, document-topic frame, topic-term frame); the second argument must be the
# fitted vectorizer for that column, not the titles Series.
nmf_sum, summary_topic, sum_wtopic = fit_nmf(X_train.summary, tfid['summary'], 3)
nmf_ch1, ch1_ctopic, ch1_wtopic = fit_nmf(X_train.ch1, tfid['ch1'], 3)
nmf_ch2, ch2_ctopic, ch2_wtopic = fit_nmf(X_train.ch2, tfid['ch2'], 3)
nmf_ch3, ch3_ctopic, ch3_wtopic = fit_nmf(X_train.ch3, tfid['ch3'], 3)
nmf_ch4, ch4_ctopic, ch4_wtopic = fit_nmf(X_train.ch4, tfid['ch4'], 3)
nmf_ch5, ch5_ctopic, ch5_wtopic = fit_nmf(X_train.ch5, tfid['ch5'], 3)
# ['summary', 'ch1', 'ch2', 'ch3', 'ch4', 'ch5', 'rating_gte_4', 'warning']

In [None]:
# Vectorize the test summaries with the summary vectorizer that was already fitted
# on the training split. Re-fitting a fresh vectorizer on the same training column
# is redundant and would replace the per-column `tfid` dict with a single object,
# breaking any earlier cell that indexes `tfid[col]` when re-run.
# NOTE(review): `X` is also the name of the full feature frame defined earlier;
# it is overwritten here.
X = tfid['summary'].transform(X_test.summary)

In [None]:
# Shape check: project the vectorized matrix `X` onto the topics of `nmf_sum`
nmf_sum.transform(X).shape

---
---
**Make Random Forest Model**

In [None]:
# Search space for the randomized hyper-parameter search.

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 1000, num = 10)]

# Number of features to consider at every split.
# 'auto' is no longer accepted by recent scikit-learn forests; for a regressor it
# meant "all features", which is spelled 1.0 here and works on old and new versions.
max_features = [1.0, 'sqrt']

# Maximum number of levels in tree (None = grow until the leaves are pure)
max_depth = [int(x) for x in np.linspace(10, 80, num = 11)]
max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)

In [None]:
# Use the random grid to search for best hyperparameters

# Regression target. `yreg_train` was never defined in the notebook.
# NOTE(review): assumed to be the 'rating' column of y_train -- confirm.
yreg_train = y_train['rating']

# The forest needs numeric input, so leave out the raw text and title columns.
# NOTE(review): assumes every remaining column is numeric -- confirm.
X_train_num = X_train.drop(columns = text_col + ['title'], errors = 'ignore')

# First create the base model to tune (seeded so the search is reproducible)
rfr = RandomForestRegressor(random_state = 0)

# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rfr_random = RandomizedSearchCV(estimator = rfr, param_distributions = random_grid, 
                                n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)

# Fit the random search model
rfr_random.fit(X_train_num, yreg_train)

# Return best param
best_params = rfr_random.best_params_
best_params

In [None]:
# Permutation importance: relative drop in R^2 when one feature column of the
# validation fold is shuffled, averaged over repeated random splits.

# Regression target. `yreg_train` was never defined in the notebook.
# NOTE(review): assumed to be the 'rating' column of y_train -- confirm.
yreg_train = y_train['rating']

# The forest needs numeric input, so leave out the raw text and title columns.
# NOTE(review): assumes every remaining column is numeric -- confirm.
X_train_num = X_train.drop(columns = text_col + ['title'], errors = 'ignore')

# Feature names must come from the matrix actually fed to the model, so that
# names[i] matches column i of the arrays below.
names = X_train_num.columns
X_values = X_train_num.values
y_values = yreg_train.values

rfr = RandomForestRegressor(random_state=0, **best_params)
raw_scores = defaultdict(list)

# CV with shuffle splits (seeded so the importances are reproducible)
splitter = ShuffleSplit(100, test_size  = 0.2, random_state = 0)
rng = np.random.RandomState(0)

for train_index, test_index in splitter.split(X_values):
    X_traincv, X_testcv = X_values[train_index], X_values[test_index]
    y_traincv, y_testcv = y_values[train_index], y_values[test_index]

    # Fit model on this split
    rfr.fit(X_traincv, y_traincv)

    # Baseline R^2 on the untouched validation fold
    acc = r2_score(y_testcv, rfr.predict(X_testcv))

    # Shuffle one column at a time and record the relative loss in R^2
    for i, name in enumerate(names):
        X_t = X_testcv.copy()
        rng.shuffle(X_t[:, i])
        shuff_acc = r2_score(y_testcv, rfr.predict(X_t))
        raw_scores[name].append((acc-shuff_acc)/acc)

score_series = pd.DataFrame(raw_scores).mean()
scores = pd.DataFrame({'Mean Decrease Accuracy' : score_series})

# Leave `rfr` fitted on the full training split; otherwise later cells would use
# a model trained on whichever 80% subset happened to be the last split.
rfr.fit(X_values, y_values)

In [None]:
# Horizontal bar chart of the ten features with the largest mean decrease in accuracy
(
    scores
    .sort_values('Mean Decrease Accuracy', ascending = False)
    .iloc[:10]
    .plot.barh(figsize = (6, 4))
)

In [None]:
# R^2 of the forest on the held-out test split.
# `yreg_test` was never defined in the notebook.
# NOTE(review): assumed to be the 'rating' column of y_test -- confirm.
yreg_test = y_test['rating']

# Raw text and title columns cannot be fed to the forest; pass plain values,
# matching how the model is fitted in the cross-validation cell.
X_test_num = X_test.drop(columns = text_col + ['title'], errors = 'ignore')
r2_score(yreg_test, rfr.predict(X_test_num.values))

In [None]:
# Names of the ten features with the largest mean decrease in accuracy
(
    scores
    .sort_values('Mean Decrease Accuracy', ascending = False)
    .iloc[:10]
    .index
)

In [None]:
rfr.