### This is from a Kaggle competition: https://www.kaggle.com/c/nlp-getting-started/submit

In [1]:
import pandas as pd
import numpy as np

df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

In [2]:
df_train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [3]:
import sidetable
df_train.stb.missing(style=True)

Unnamed: 0,missing,total,percent
location,2533,7613,33.27%
keyword,61,7613,0.80%
id,0,7613,0.00%
text,0,7613,0.00%
target,0,7613,0.00%


In [4]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
vect = TfidfVectorizer(stop_words='english')
vect.fit(df_train.text)
X_train = vect.transform(df_train.text)
X_test = vect.transform(df_test.text)

In [5]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X_train, df_train.target, stratify=df_train.target, test_size=0.05, random_state=1)
X_train = X_train.toarray()
X_val = X_val.toarray()

In [6]:
'''
Credit source: https://blog.tensorflow.org/2020/01/hyperparameter-tuning-with-keras-tuner.html
'''
from sklearn.ensemble import RandomForestClassifier
from xgboost.sklearn import XGBClassifier
import kerastuner as kt

def build_model(hp):
    model_type = hp.Choice('model_type', ['rfc', 'xgb'])
    if model_type == 'rfc':
        with hp.conditional_scope('model_type', 'random_forest'):
            model = RandomForestClassifier(
                bootstrap = hp.Choice('bootstrap', [True, False]),
                max_depth = hp.Choice('max_depth', [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 0]),
                max_features = hp.Choice('max_features', ['auto', 'sqrt']),
                min_samples_leaf = hp.Choice('min_samples_leaf', [1, 2, 4]),
                min_samples_split = hp.Choice('min_samples_split', [2, 5, 10]),
                n_estimators = hp.Choice('n_estimators', [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000])
            )
    elif model_type == 'xgb':
        with hp.conditional_scope('model_type', 'xgb'):
            model = XGBClassifier(
                max_depth = hp.Int('max_depth', 3, 10, 2),
                min_child_weight = hp.Int('min_child_weight', 1, 6, 2),
                gamma = hp.Float('gamma', 0.1, 0.5, 0.1),
                subsample = hp.Float('subsample', 0.6, 1, 0.1),
                colsample_bytree = hp.Float('colsample_bytree', 0.6, 1, 0.1),
                reg_alpha = hp.Choice('reg_alpha', [1e-5, 1e-2, 0.1, 1, 100]),
                learning_rate = hp.Choice('learning_rate', [1e-2, 1e-3, 1e-4])
            )
    else:
        raise ValueError('Unrecognized model_type')
    return model

tuner = kt.tuners.Sklearn(
        oracle=kt.oracles.BayesianOptimization(
            objective=kt.Objective('score', 'max'),
            max_trials=16),
            overwrite=True,
        hypermodel=build_model)

In [7]:
tuner.search(X_train, y_train)


Search: Running Trial #1

Hyperparameter    |Value             |Best Value So Far 
model_type        |xgb               |?                 



KeyError: "Passing list-likes to .loc or [] with any missing labels is no longer supported. The following labels were missing: Int64Index([  13,   30,  137,  146,  211,\n            ...\n            7129, 7156, 7177, 7193, 7214],\n           dtype='int64', length=291). See https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike"

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_pred, y_val))

In [None]:
# saving the dataframe 
y_pred.to_csv('Predictions.csv') 

In [None]:
import winsound
duration = 2000  # milliseconds
freq = 3000  # Hz
winsound.Beep(freq, duration)