In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from  sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

## Encoder

In [2]:
#function to encode values in the dataset for easier computations
#Input: dataset, list of labels (empty if first time encoding, or previous labelList of encoding another dataset)
#Output: List of label:value mappings, encoded dataset
def myEncoder(labelList, data):

   encodedData = data.copy(deep=True)

   for col in range(0, len(encodedData.columns)):
      if(len(labelList) < 40):
         labelList.append({})
      uniqueVal = 0
      
      for row in range(0, len(encodedData)):
         current = encodedData.iloc[row,col]
         if(current not in labelList[col].keys()): #If the current value has not yet been mapped, map it
               labelList[col].update({current: uniqueVal})
               uniqueVal = uniqueVal + 1

         encodedData.iloc[row,col] = labelList[col][current] #Encode the data

   return labelList, encodedData

In [3]:
training = pd.read_csv('cleanedTrainingData.csv')
tLabels = pd.read_csv('trainingsetlabels.csv')
trueTest = pd.read_csv('cleanedTestData.csv')
labels = tLabels["status_group"]
training = training.join(labels)
training.date_recorded = pd.to_datetime(training.date_recorded).dt.strftime("%Y%m%d")
trueTest.date_recorded = pd.to_datetime(trueTest.date_recorded).dt.strftime("%Y%m%d")

stringCols = training.select_dtypes(object)
testStrings = trueTest.select_dtypes(object)
labelList = []
labelList, stringCols = myEncoder(labelList, stringCols)
labelList, testStrings = myEncoder(labelList, testStrings)

In [4]:
for cols in testStrings.columns:
   training.loc[:,cols] = stringCols.loc[:,cols]
   trueTest.loc[:,cols] = testStrings.loc[:,cols]
training.loc[:,"status_group"] = stringCols.loc[:,"status_group"]

tLabels = training.loc[:,"status_group"]
training = training.drop("status_group", axis = 1)

trainingData, testData, trainingLabels, testLabels = train_test_split(training, tLabels, test_size=0.2, random_state=42)

for col in trainingData:
   trainingData.loc[:, col] = trainingData.loc[:,col].astype("float")
   testData.loc[:, col] = testData.loc[:,col].astype("float")
   trueTest.loc[:, col] = trueTest.loc[:,col].astype("float")
trainingLabels = trainingLabels.astype("float")
testLabels = testLabels.astype("float")

In [5]:
scaler = StandardScaler()
scaler.fit(trainingData)
trainingData = scaler.transform(trainingData)
testData = scaler.transform(testData)
trueTest = scaler.transform(trueTest)

In [8]:
params = {'random_state':42,'bootstrap':True, 'criterion': 'entropy', 'max_features': 10, 'n_estimators':400, 'n_jobs':-1, 'oob_score':True, 'warm_start':True}
clf = RandomForestClassifier(random_state=42, bootstrap=True, criterion='entropy', max_features=10, n_estimators=400, n_jobs=-1, oob_score=True, warm_start=True)
clf.fit(trainingData,trainingLabels)
clf

RandomForestClassifier(criterion='entropy', max_features=10, n_estimators=400,
                       n_jobs=-1, oob_score=True, random_state=42,
                       warm_start=True)

## Predict on Test Data

In [10]:
testPredict = clf.predict(trueTest)
idcol = pd.read_csv('cleanedTestData.csv')
reversesStatus = dict((v,k) for k,v in labelList[28].items())
reverseID = dict((v,k) for k,v in labelList[0].items())
trueTest[0:,0] = trueTest[0:,0].astype("int")
testPredict = testPredict.astype("int")
df = pd.DataFrame()
ids = []
status = []
for row in range(0, len(trueTest)):
   ids.append(idcol.iloc[row,0])
   status.append(reversesStatus[testPredict[row]])
df["id"] = ids
df["status_group"] = status
df.to_csv("preds_03221022.csv", index=False)

In [11]:
df['id'].nunique

<bound method IndexOpsMixin.nunique of 0        50785
1        51630
2        17168
3        45559
4        49871
         ...  
14845    39307
14846    18990
14847    28749
14848    33492
14849    68707
Name: id, Length: 14850, dtype: int64>