In [112]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.feature_selection import RFECV

## Encoder

In [64]:
#function to encode values in the dataset for easier computations
#Input: dataset, list of labels (empty if first time encoding, or previous labelList of encoding another dataset)
#Output: List of label:value mappings, encoded dataset
def myEncoder(labelList, data):

   encodedData = data.copy(deep=True)

   for col in range(0, len(encodedData.columns)):
      if(len(labelList) < 40):
         labelList.append({})
      uniqueVal = 0
      
      for row in range(0, len(encodedData)):
         current = encodedData.iloc[row,col]
         if(current not in labelList[col].keys()): #If the current value has not yet been mapped, map it
               labelList[col].update({current: uniqueVal})
               uniqueVal = uniqueVal + 1

         encodedData.iloc[row,col] = labelList[col][current] #Encode the data

   return labelList, encodedData

## Read in Data

In [65]:
training = pd.read_csv('cleanedTrainingData.csv')
tLabels = pd.read_csv('trainingsetlabels.csv')
tr_tst_no_id = pd.read_csv('cleanedTestData.csv')
labels = tLabels["status_group"]


In [66]:
select_feats = [
   "longitude",
   "latitude",
   "region", 
   "region_code", 
   "district_code", 
   "lga", 
   "extraction_type", 
   "extraction_type_group", 
   "extraction_type_class", 
   "payment", 
   "payment_type", 
   "quantity", 
   "quantity_group", 
   "source_type", 
   "waterpoint_type", 
   "waterpoint_type_group", 
   "amount_tsh", 
   "source", 
   "source_class", 
   "gps_height", 
   "construction_year", 
   "date_recorded", 
   "subvillage", 
   "wpt_name"
   ]

In [67]:
tr_no_id = training[select_feats]
tr_tst_no_id = tr_tst_no_id[select_feats]
tr_no_id = tr_no_id.join(labels)
tr_no_id.date_recorded = pd.to_datetime(tr_no_id.date_recorded).dt.strftime("%Y%m%d")
tr_tst_no_id.date_recorded = pd.to_datetime(tr_tst_no_id.date_recorded).dt.strftime("%Y%m%d")

stringColsNoId = tr_no_id.select_dtypes(object)
testStringsNoId = tr_tst_no_id.select_dtypes(object)
labelListNoId = []
labelListNoId, stringColsNoId = myEncoder(labelListNoId, stringColsNoId)
labelListNoId, testStringsNoId = myEncoder(labelListNoId, testStringsNoId)

In [68]:
for cols in testStringsNoId.columns:
   tr_no_id.loc[:,cols] = stringColsNoId.loc[:,cols]
   tr_tst_no_id.loc[:,cols] = testStringsNoId.loc[:,cols]
tr_no_id.loc[:,"status_group"] = stringColsNoId.loc[:,"status_group"]

tLabels = tr_no_id.loc[:,"status_group"]
tr_no_id = tr_no_id.drop("status_group", axis = 1)

trainingData, testData, trainingLabels, testLabels = train_test_split(tr_no_id, tLabels, test_size=0.2, random_state=42)

for col in trainingData:
   trainingData.loc[:, col] = trainingData.loc[:,col].astype("float")
   testData.loc[:, col] = testData.loc[:,col].astype("float")
   tr_tst_no_id.loc[:, col] = tr_tst_no_id.loc[:,col].astype("float")
trainingLabels = trainingLabels.astype("float")
testLabels = testLabels.astype("float")

In [69]:
scaler = StandardScaler()
scaler.fit(trainingData)
trainingData = scaler.transform(trainingData)
testData = scaler.transform(testData)
tr_tst_no_id = scaler.transform(tr_tst_no_id)

In [8]:
# params = {'random_state':42,'bootstrap':True, 'criterion': 'entropy', 'max_features': 10, 'n_estimators':400, 'n_jobs':-1, 'oob_score':True, 'warm_start':True}
# clf = RandomForestClassifier(random_state=42, bootstrap=True, criterion='entropy', max_features=10, n_estimators=400, n_jobs=-1, oob_score=True, warm_start=True)
# clf.fit(trainingData,trainingLabels)
# clf

RandomForestClassifier(criterion='entropy', max_features=10, n_estimators=400,
                       n_jobs=-1, oob_score=True, random_state=42,
                       warm_start=True)

In [108]:
# clf = Pipeline([
#   ('feature_selection', RFECV(LinearSVC(),n_jobs=-1)),
#   ('classification', RandomForestClassifier(random_state=42, bootstrap=True, criterion='entropy', n_estimators=400, n_jobs=-1, oob_score=True, warm_start=True))
# ])
# clf.fit(trainingData, trainingLabels)



Pipeline(steps=[('feature_selection',
                 RFECV(estimator=LinearSVC(), min_features_to_select=10,
                       n_jobs=-1)),
                ('classification',
                 RandomForestClassifier(criterion='entropy', max_features=10,
                                        n_estimators=400, n_jobs=-1,
                                        oob_score=True, random_state=42,
                                        warm_start=True))])

In [133]:
clf = Pipeline([
  ('feature_selection', RFECV(LinearSVC(),n_jobs=-1)),
  ('classification', AdaBoostClassifier(learning_rate=0.5, random_state=42))
])
clf.fit(trainingData, trainingLabels)



Pipeline(steps=[('feature_selection', RFECV(estimator=LinearSVC(), n_jobs=-1)),
                ('classification',
                 AdaBoostClassifier(learning_rate=0.5, random_state=42))])

In [137]:
clf[1]._final_estimator

AttributeError: 'AdaBoostClassifier' object has no attribute '_final_estimator'

In [134]:
test_pred = clf.predict(testData)

## AdaBoost learning_rate=0.5 validation accuracy

In [136]:
from sklearn.metrics import balanced_accuracy_score
balanced_accuracy_score(np.array(test_pred),np.array(testLabels))

0.596149584196153

## Random Forest Results
### default params and features from top overlapping used for initial reduction, further reduced in Pipeline with RFECV

In [None]:
from sklearn.metrics import accuracy_score

accuracy_score(np.array(test_pred),np.array(testLabels))

0.789983164983165

## RandomForest Results
### params from previous GridsearchCV run and top overlapping features

In [110]:
from sklearn.metrics import accuracy_score

accuracy_score(np.array(test_pred),np.array(testLabels))

0.8036195286195286

### AdaBoost Results

In [116]:
from sklearn.metrics import accuracy_score

accuracy_score(np.array(test_pred),np.array(testLabels))

0.7166666666666667

## Predict on Test Data

In [111]:
testPredict = clf.predict(tr_tst_no_id)
idcol = pd.read_csv('cleanedTestData.csv')
# reversesStatus = dict((v,k) for k,v in labelListNoId[28].items())
reversesStatus = dict((v,k) for k,v in labelListNoId[17].items())
reverseID = dict((v,k) for k,v in labelListNoId[0].items())
tr_tst_no_id[0:,0] = tr_tst_no_id[0:,0].astype("int")
testPredict = testPredict.astype("int")
df = pd.DataFrame()
ids = []
status = []
for row in range(0, len(tr_tst_no_id)):
   ids.append(idcol.iloc[row,0])
   status.append(reversesStatus[testPredict[row]])
df["id"] = ids
df["status_group"] = status
df.to_csv("gridsearchcv_params_select_feats_preds_03272022.csv", index=False)

In [103]:
labelListNoId[17]

{'functional': 0, 'non functional': 1, 'functional needs repair': 2}

In [97]:
reversesStatus

{}