In [None]:
%%capture
import sys

if 'google.colab' in sys.modules:
    DATA_PATH = 'https://raw.githubusercontent.com/LambdaSchool/DS-Unit-2-Kaggle-Challenge/main/data/'
    !pip install category_encoders==2.*
    !pip install pandas-profiling==2.*

else:
    DATA_PATH = '../data/'

In [None]:
# Attach Google Drive to the Colab filesystem so files stored there can be
# read by path from the cells below.
from google.colab import drive

GDRIVE_MOUNT_POINT = '/content/gdrive'
drive.mount(GDRIVE_MOUNT_POINT)

In [None]:
# Switch the working directory to the Kaggle folder on the mounted Drive;
# later cells open the CSV files by bare file name, relative to this folder.
%cd /content/gdrive/My Drive/Kaggle

In [None]:
%%capture
!pip install category_encoders==2.*
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score, validation_curve, GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import make_pipeline
from category_encoders import OrdinalEncoder

In [None]:
def wrangle(fm_path, tv_path=None, cutoff=100):
    """Load a feature matrix (optionally with its labels) and prune columns.

    Parameters
    ----------
    fm_path : str, path-like or seekable file object
        CSV file holding the features. It must contain an ``id`` column and
        a ``recorded_by`` column.
    tv_path : str or path-like, optional
        CSV file holding the target values. When given, it is merged with
        the features on the columns the two files share before ``id``
        becomes the index.
    cutoff : int, default 100
        Object-dtype columns with more than ``cutoff`` distinct values are
        treated as high-cardinality categoricals and dropped.

    Returns
    -------
    pandas.DataFrame
        Frame indexed by ``id`` with ``recorded_by``, the high-cardinality
        object columns and any column that duplicates an earlier column
        (judged on the first 100 rows) removed.
    """
    na_sentinels = [0, -2.000000e-08]

    # Register the sentinels per column and leave `id` out. A flat list
    # would also apply to `id`, turning a legitimate id of 0 into NaN: that
    # row would then vanish in the merge below, or end up as a NaN index.
    header = pd.read_csv(fm_path, nrows=0).columns
    if hasattr(fm_path, 'seek'):
        fm_path.seek(0)  # rewind file-like inputs after peeking at the header
    na_values = {col: na_sentinels for col in header if col != 'id'}

    df = pd.read_csv(fm_path, na_values=na_values)
    if tv_path:
        df = pd.merge(df, pd.read_csv(tv_path))
    df = df.set_index('id')

    # Drop constant columns
    df = df.drop(columns=['recorded_by'])

    # Drop high-cardinality categorical columns (HCCCs)
    high_card_cols = [col for col in df.select_dtypes('object').columns
                      if df[col].nunique() > cutoff]
    df = df.drop(columns=high_card_cols)

    # Drop duplicate columns. The mask is computed once, on the first 100
    # rows only, to keep the transpose cheap.
    is_dupe = df.head(100).T.duplicated()
    dupe_cols = [col for col, dupe in is_dupe.items() if dupe]
    df = df.drop(columns=dupe_cols)

    return df

In [None]:
# Load the labelled training data and the unlabelled test features.
df = wrangle(fm_path='train_features.csv', tv_path='train_labels.csv')
X_test = wrangle(fm_path='test_features.csv')

# wrangle() decides which columns to drop separately for each file (by
# cardinality and by duplicates among the first 100 rows), so the two frames
# are not guaranteed to end up with the same columns. Align the test frame
# to the training features, and fail loudly if a training feature is absent
# instead of predicting on mismatched columns.
feature_cols = df.columns.drop('status_group')
missing_cols = feature_cols.difference(X_test.columns)
if len(missing_cols) > 0:
    raise ValueError(
        'Test features are missing columns kept for training: '
        + ', '.join(map(str, missing_cols))
    )
X_test = X_test[feature_cols]

In [None]:
# Separate the target vector from the feature matrix.
target = 'status_group'
y = df[target]
X = df.drop(columns=target)

In [None]:
# Baseline: the accuracy obtained by always predicting the most frequent class.
class_shares = y.value_counts(normalize=True)
baseline_acc = class_shares.max()
print('Baseline Accuracy Score:', baseline_acc)

In [None]:
# Random-forest pipeline: encode categoricals as integers, fill in missing
# values, then fit the forest with a fixed seed.
encoder = OrdinalEncoder()
imputer = SimpleImputer()
forest = RandomForestClassifier(random_state=42)

clf_rf = make_pipeline(encoder, imputer, forest)

In [None]:
# 5-fold cross-validation of the untuned pipeline, using every available core.
cv_scores_rf = cross_val_score(
    estimator=clf_rf,
    X=X,
    y=y,
    cv=5,
    n_jobs=-1,
)

In [None]:
# Report the per-fold scores together with their mean and spread.
cv_mean = cv_scores_rf.mean()
cv_std = cv_scores_rf.std()

print('CV score RandomForestClassifier')
print(cv_scores_rf)
print('Mean CV accuracy score:', cv_mean)
print('STD CV accuracy score:', cv_std)

In [None]:
# Tune hyperparameters with a randomized search over the pipeline.
# Keys follow make_pipeline's naming: <lowercased step class>__<parameter>.
param = {
    'simpleimputer__strategy': ['mean', 'median'],
    'randomforestclassifier__max_depth': range(10, 25, 2),
    'randomforestclassifier__n_estimators': range(100, 131, 5)}

model = RandomizedSearchCV(
    clf_rf,
    param_distributions=param,
    n_jobs=-1,
    cv=5,
    verbose=1,
    n_iter=25,
    # Seed the candidate sampling: without it a different set of 25
    # combinations is drawn on every run, so the best score and parameters
    # cannot be reproduced even though the forest itself is seeded.
    random_state=42,
)

model.fit(X, y)

In [None]:
# Pull the winning configuration out of the fitted search and report it.
best_score, best_params = model.best_score_, model.best_params_

for label, value in (('score', best_score), ('params', best_params)):
    print('Best ' + label + ' for `model`:', value)

In [None]:
# Predict the test set with the refitted best estimator and shape the result
# as a one-column frame keyed by the test ids.
y_pred = model.predict(X_test)

submission = pd.Series(y_pred, index=X_test.index, name='status_group').to_frame()