In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from  sklearn.preprocessing import StandardScaler, LabelEncoder
from scipy.stats import randint

# Read Data

In [3]:
training_data = pd.read_csv('cleanedTrainingData.csv')
training_labels = pd.read_csv('trainingsetlabels.csv')
test_data = pd.read_csv('cleanedTestData.csv')

# Preparation

In [4]:
# training_data.date_recorded = pd.to_datetime(training_data.date_recorded).dt.strftime("%Y%m%d")
# test_data.date_recorded = pd.to_datetime(test_data.date_recorded).dt.strftime("%Y%m%d")
training_data = training_data.drop('id', axis=1)
stringCols = training_data.select_dtypes(object)
testStrings = test_data.select_dtypes(object)

In [5]:
our_features =  [
   'longitude', 
   'latitude', 
   'region', 
   'region_code', 
   'district_code', 
   'lga', 
   'extraction_type', 
   'extraction_type_group', 
   'extraction_type_class', 
   'payment', 
   'payment_type', 
   'quantity', 
   'quantity_group', 
   'source_type', 
   'waterpoint_type', 
   'waterpoint_type_group', 
   'amount_tsh', 
   'source', 
   'source_class', 
   'gps_height', 
   'construction_year', 
   'date_recorded', 
   'subvillage', 
   'wpt_name', 
   'num_private', 
   'recorded_by', 
   'permit', 
   'management_group', 
   'quality_group', 
   'funder', 
   'installer', 
   'ward', 
   'population', 
   'scheme_name'
   ]

In [6]:
for feature in stringCols:
   training_data[feature] = training_data[feature].str.upper()
   test_data[feature] = test_data[feature].str.upper()

## Drop extra features from training set

In [7]:
training_data= training_data[our_features]

## Drop extra features from test set

In [8]:
test_data=test_data[our_features]

In [9]:
# scale.transform(df)

In [10]:
len(training_data.columns)

34

In [11]:
type(training_data.lga.values)

numpy.ndarray

In [12]:
train_len = len(training_data)
test_len = len (test_data)
le = LabelEncoder()
df = pd.concat([training_data,test_data])
scale = StandardScaler()
for feature in our_features:
   df[feature] = le.fit_transform(df[feature]).astype('str')
   df[feature] = scale.fit_transform(df[feature].array.reshape(-1,1))

## Check lengths for accuracy
encoded_train = df[:train_len]
encoded_test = df[train_len:]
print(train_len, len(encoded_train))
print(test_len, len(encoded_test))

59400 59400
14850 14850


## Convert Training features to appropriate types

In [13]:
# training_data['public_meeting']=training_data['public_meeting'].astype('boolean')
# training_data['permit']=training_data['permit'].astype('boolean')
# training_data['region_code']=training_data['region_code'].astype(object)
# training_data['funder']=training_data['funder'].astype(object)
# training_data['installer']=training_data['installer'].astype(object)
# training_data['basin']=training_data['basin'].astype(object)
# training_data['subvillage']=training_data['subvillage'].astype(object)
# training_data['lga']=training_data['lga'].astype(object)
# training_data['ward']=training_data['ward'].astype(object)
# training_data['scheme_management']=training_data['scheme_management'].astype(object)
# training_data['extraction_type']=training_data['extraction_type'].astype(object)
# training_data['management']=training_data['management'].astype(object)
# training_data['management_group']=training_data['management_group'].astype(object)
# # training_data['payment']=training_data['payment'].astype(object)
# training_data['payment_type']=training_data['payment_type'].astype(object)
# training_data['water_quality']=training_data['water_quality'].astype(object)
# training_data['quantity']=training_data['quantity'].astype(object)
# training_data['source']=training_data['source'].astype('category')
# training_data['waterpoint_type']=training_data['waterpoint_type'].astype(object)
# # training_data['wpt_name']=training_data['wpt_name'].astype(object)

## Convert Testing features to appropriate types

In [14]:
# test_data['public_meeting']=test_data['public_meeting'].astype('boolean')
# test_data['permit']=test_data['permit'].astype('boolean')
# test_data['region_code']=test_data['region_code'].astype('category')
# test_data['funder']=test_data['funder'].astype('category')
# test_data['installer']=test_data['installer'].astype('category')
# test_data['basin']=test_data['basin'].astype('category')
# test_data['subvillage']=test_data['subvillage'].astype('category')
# test_data['lga']=test_data['lga'].astype('category')
# test_data['ward']=test_data['ward'].astype('category')
# test_data['scheme_management']=test_data['scheme_management'].astype('category')
# test_data['extraction_type']=test_data['extraction_type'].astype('category')
# test_data['management']=test_data['management'].astype('category')
# test_data['management_group']=test_data['management_group'].astype('category')
# # test_data['payment']=test_data['payment'].astype('category')
# test_data['payment_type']=test_data['payment_type'].astype('category')
# test_data['water_quality']=test_data['water_quality'].astype('category')
# test_data['quantity']=test_data['quantity'].astype('category')
# test_data['source']=test_data['source'].astype('category')
# test_data['waterpoint_type']=test_data['waterpoint_type'].astype('category')
# # test_data['wpt_name']=test_data['wpt_name'].astype('category')

In [15]:
# dummied_training = pd.get_dummies(training_data, drop_first=True)

# training_data['public_meeting']=OneHotEncoder(training_data['public_meeting'])
# training_data['permit']=pd.OneHotEncoder(training_data['permit'])
# training_data['region_code']=pd.OneHotEncoder(training_data['region_code'])
# training_data['funder']=pd.get_dummies(training_data['funder'], drop_first=True)
# training_data['installer']=pd.get_dummies(training_data['installer'], drop_first=True)
# training_data['basin']=pd.get_dummies(training_data['basin'], drop_first=True)
# training_data['subvillage']=pd.get_dummies(training_data['subvillage'], drop_first=True)
# training_data['lga']=pd.get_dummies(training_data['lga'], drop_first=True)
# training_data['ward']=pd.get_dummies(training_data['ward'], drop_first=True)
# training_data['scheme_management']=pd.get_dummies(training_data['scheme_management'], drop_first=True)
# training_data['extraction_type']=pd.get_dummies(training_data['extraction_type'], drop_first=True)
# training_data['management']=pd.get_dummies(training_data['management'], drop_first=True)
# training_data['management_group']=pd.get_dummies(training_data['management_group'], drop_first=True)
# training_data['payment']=pd.get_dummies(training_data['payment'], drop_first=True)
# training_data['payment_type']=pd.get_dummies(training_data['payment_type'], drop_first=True)
# training_data['water_quality']=pd.get_dummies(training_data['water_quality'], drop_first=True)
# training_data['quantity']=pd.get_dummies(training_data['quantity'], drop_first=True)
# training_data['source']=pd.get_dummies(training_data['source'], drop_first=True)
# training_data['waterpoint_type']=pd.get_dummies(training_data['waterpoint_type'], drop_first=True)
# training_data['wpt_name']=pd.get_dummies(training_data['wpt_name'], drop_first=True)

Approximately half of the values in 'funder' and 'installer' have only 1 entry
* Maybe create new feature for each to reduce both to 2 sets per ([funder > 1: Bool], [installer > 1: Bool])

In [16]:
# import seaborn as sns
# sns.set_style('whitegrid')
# sns.countplot(x='funder_group', data=training_data)
(training_data.funder.value_counts()>1).value_counts(normalize=True)
# (training_data.installer.value_counts()>1).value_counts(normalize=True)

False    0.513713
True     0.486287
Name: funder, dtype: float64

In [17]:
training_data.groupby('funder').filter(lambda x: len(x) >0 and len(x) < 50000).funder.value_counts(normalize=True)
# training_data.funder.value_counts()

GOVERNMENT OF TANZANIA    0.152929
ROMAN                     0.078906
DANIDA                    0.052424
HESAWA                    0.037071
RWSSP                     0.023131
                            ...   
LOTTERY                   0.000017
RARYMOND EKURA            0.000017
JUSTINE MARWA             0.000017
MUNICIPAL COUNCIL         0.000017
SAMLO                     0.000017
Name: funder, Length: 1896, dtype: float64

In [18]:
# training_data.corr()
# dummied_training.corr()

In [19]:
# training_data.dtypes

In [20]:
# stringCols = training_data.select_dtypes(object)
# testStrings = test_data.select_dtypes('category')
# stringCols.columns

# Encoding
## Create list of columns

In [21]:
# features = training_data.columns
training_data.columns

Index(['longitude', 'latitude', 'region', 'region_code', 'district_code',
       'lga', 'extraction_type', 'extraction_type_group',
       'extraction_type_class', 'payment', 'payment_type', 'quantity',
       'quantity_group', 'source_type', 'waterpoint_type',
       'waterpoint_type_group', 'amount_tsh', 'source', 'source_class',
       'gps_height', 'construction_year', 'date_recorded', 'subvillage',
       'wpt_name', 'num_private', 'recorded_by', 'permit', 'management_group',
       'quality_group', 'funder', 'installer', 'ward', 'population',
       'scheme_name'],
      dtype='object')

## Create OneHotEncoder

In [22]:
# mergedata = training_data.append(test_data.drop('id',axis=1))
# testcount = len(test_data)
# count = len(mergedata)-testcount
# X_cat = mergedata.copy()
# X_cat = mergedata.select_dtypes(include=['category'])
# X_enc = X_cat.copy()

# X_enc = pd.get_dummies(X_enc, columns=X_cat.columns, drop_first=True)
# # mergedata = mergedata.drop(stringCols.columns, axis=1)
# FinalData = pd.concat([mergedata, X_enc], axis=1)
# encoded_train = FinalData[:count]
# encoded_test = FinalData[count:]

In [23]:
X_train, X_test, y_train, y_test = train_test_split(encoded_train, training_labels, test_size=0.2, random_state=42)
y_train_id = y_train.copy()
y_test_id = y_test.copy()
y_train = y_train.drop('id',axis=1)
y_test = y_test.drop('id',axis=1)

## check if length and created columns are correct

In [24]:
# X_train['payment']

### MEMORY ERRORS
* After creating dummy features, the amount of columns in the dataframe increases from 40 to ~77,000
* One way to handle this may be to split the data into multiple smaller sets maybe 1,000-2,000 per
* One idea would be to decide the most important features to keep based on the smaller sets

In [25]:
# X_train.iloc[:1000,:]

In [52]:
clf = GradientBoostingClassifier(random_state=42)
params = {
   "loss": ['deviance','exponential'],
   "n_estimators": [100, 200, 300, 400, 500],
   # "n_jobs": -1,
   "max_features": ['sqrt', 'log2'],
   "min_samples_split": [2, 3, 5],
   'verbose': [1]}
gsh = GridSearchCV(clf, params)

In [53]:
# clf = RandomForestClassifier(criterion='entropy', max_features=10, n_estimators=400,
#                        n_jobs=-1, oob_score=True, random_state=42,
#                        warm_start=True)
# gsh.fit(X_train,y_train)
flat_y_train = np.ravel(y_train)
gsh.fit(X_train, flat_y_train)

      Iter       Train Loss   Remaining Time 
         1           0.8589            8.22s
         2           0.8355            8.18s
         3           0.8169            7.83s
         4           0.7988            7.54s
         5           0.7836            7.45s
         6           0.7737            7.33s
         7           0.7660            7.31s
         8           0.7573            7.15s
         9           0.7476            7.05s
        10           0.7401            6.89s
        20           0.6918            6.16s
        30           0.6659            5.34s
        40           0.6483            4.55s
        50           0.6367            3.76s
        60           0.6271            3.00s
        70           0.6189            2.24s
        80           0.6121            1.49s
        90           0.6061            0.75s
       100           0.6001            0.00s
      Iter       Train Loss   Remaining Time 
         1           0.8588            8.21s
        

150 fits failed out of a total of 300.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
150 fits failed with the following error:
Traceback (most recent call last):
  File "E:\Users\Zac\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "E:\Users\Zac\anaconda3\lib\site-packages\sklearn\ensemble\_gb.py", line 525, in fit
    self._check_params()
  File "E:\Users\Zac\anaconda3\lib\site-packages\sklearn\ensemble\_gb.py", line 310, in _check_params
    self.loss_ = loss_class(self.n_classes_)
  File "E:\Users\Zac\anaconda3\lib\site-packages\sklearn\ensemble\_gb_losses.py", line 890, in __init__
    raise ValueError(
ValueError: ExponentialLo

      Iter       Train Loss   Remaining Time 
         1           0.8588           48.91s
         2           0.8359           50.31s
         3           0.8171           49.05s
         4           0.7989           47.75s
         5           0.7839           47.33s
         6           0.7738           47.19s
         7           0.7660           47.20s
         8           0.7572           46.75s
         9           0.7475           46.38s
        10           0.7400           45.58s
        20           0.6906           45.13s
        30           0.6651           43.91s
        40           0.6483           42.81s
        50           0.6365           41.61s
        60           0.6263           40.71s
        70           0.6179           39.86s
        80           0.6114           39.49s
        90           0.6054           38.65s
       100           0.6001           37.77s
       200           0.5650           28.15s
       300           0.5441           18.81s
       40

GridSearchCV(estimator=GradientBoostingClassifier(random_state=42),
             param_grid={'loss': ['deviance', 'exponential'],
                         'max_features': ['sqrt', 'log2'],
                         'min_samples_split': [2, 3, 5],
                         'n_estimators': [100, 200, 300, 400, 500],
                         'verbose': [1]})

In [55]:
best_clf = gsh.best_estimator_
# gsh.best_score_
# gsh.feature_names_in_

In [63]:
best_clf

GradientBoostingClassifier(max_features='sqrt', min_samples_split=5,
                           n_estimators=500, random_state=42, verbose=1)

In [56]:
best_clf.feature_importances_

array([0.0467724 , 0.02945074, 0.02075764, 0.01909107, 0.01135391,
       0.03557288, 0.02911998, 0.01429564, 0.04734929, 0.0249895 ,
       0.02233343, 0.18266588, 0.10811431, 0.008682  , 0.07203389,
       0.06123763, 0.0275814 , 0.01901717, 0.00403607, 0.023242  ,
       0.0623773 , 0.01877287, 0.00457944, 0.004867  , 0.00116393,
       0.        , 0.00322115, 0.01028475, 0.01196443, 0.02678208,
       0.01331945, 0.00962241, 0.01451076, 0.01083759])

In [None]:
# best_clf.fit(X_train, y_train)
# best_clf.feature_importances_

In [None]:
# import time
# import matplotlib.pyplot as plt

# start_time = time.time()
# importances = best_clf.feature_importances_
# std = np.std([tree.feature_importances_ for tree in best_clf.estimators_], axis=0)
# elapsed_time = time.time() - start_time

# print(f"Elapsed time to compute the importances: {elapsed_time:.3f} seconds")

# forest_importances = pd.Series(importances, index=our_features)

# fig, ax = plt.subplots()
# forest_importances.plot.bar(yerr=std, ax=ax)
# ax.set_title("Feature importances using MDI")
# ax.set_ylabel("Mean decrease in impurity")
# fig.tight_layout()

In [57]:
# val_pred = best_clf.predict(X_test)
val_pred = best_clf.predict(X_test)

In [58]:
error = np.mean(y_test != val_pred.reshape(-1,1))
accuracy = 1-error
accuracy

status_group    0.776178
dtype: float64

In [59]:
test_data_id = pd.read_csv('cleanedTestData.csv')
test_data_id = test_data_id['id']

In [61]:
test_pred = pd.Series(best_clf.predict(encoded_test))
test_dict = {'id': test_data_id,'status_group':test_pred}
test_pred_complete = pd.DataFrame(test_dict)
test_pred_complete.head()

Unnamed: 0,id,status_group
0,50785,functional
1,51630,functional
2,17168,functional
3,45559,non functional
4,49871,functional


In [62]:
test_pred_complete.to_csv('04112022.csv', index=False)