In [1]:
import pandas as pd
import numpy as np
import math
from datetime import datetime
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

In [2]:
# Load the Yardi journal-line export into a DataFrame and preview the first rows.
# NOTE(review): the path is an absolute, machine-specific location; the notebook
# only runs where this exact file exists. Consider a configurable data directory.
# lineterminator='\n' makes the parser split records on '\n' only.
CSV_PATH = '/Users/dhruv/code/export_yardi_jourentline.csv'
df = pd.read_csv(CSV_PATH, lineterminator='\n')
df.head()


Unnamed: 0,GLCODE,GLNAME,PROPERTY,PROPERTYNAME,UNIT,BUILDING,SPECIALCIRCUMSTANCE,USAGEAMOUNT,USAGETYPE,DATE,PERIOD,DESCRIPTION,CONTROL,REFERENCE,AMOUNT,DEBITCREDIT,BALANCE,REMARKS
0,1110-0000,Cash,ap-mc11,4203 11TH LLC,42031,,,,,09/09/2020,12/01/2020,Webster S8 (t0034597),K-396581,165,8500.0,Credit,1813181.59,First buyout check
1,1110-0000,Cash,ap-mc11,4203 11TH LLC,42032,,,,,09/29/2020,12/01/2020,Webster (t0034598),K-405215,173,2500.0,Credit,1810681.59,
2,1110-0000,Cash,ap-mc11,4203 11TH LLC,42031,,,,,10/27/2020,12/01/2020,Webster S8 (t0034597),K-417819,183,9100.0,Credit,1801581.59,
3,1110-0000,Cash,ap-mc11,4203 11TH LLC,42052,,,,,10/29/2020,12/01/2020,Jones (t0034602),K-420897,187,3500.0,Credit,1798081.59,First Buyout Check
4,1110-0000,Cash,ap-mc11,4203 11TH LLC,42053,,,,,11/02/2020,12/01/2020,Dow (t0034603),K-421957,190,3500.0,Credit,1794581.59,First Buyout Check


In [3]:
# Discard the columns that are not used further on in this notebook.
# The labels are written with their leading space because the header has not
# been whitespace-stripped yet at this point (that happens in the next cell).
# Caveat: the labels must exist, so re-running this cell on the already-trimmed
# frame raises KeyError.
df = df.drop(
    labels=[
        ' BUILDING',
        ' SPECIALCIRCUMSTANCE',
        ' USAGEAMOUNT',
        ' USAGETYPE',
        ' UNIT',
        ' BALANCE',
        ' CONTROL',
        ' REFERENCE',
        ' REMARKS',
        ' DESCRIPTION',
    ],
    axis='columns',
)

In [4]:
# Normalise the header: remove surrounding whitespace from every column name so
# the columns can be addressed as 'DATE', 'PERIOD', 'AMOUNT', ...
df.columns = df.columns.str.strip()

# Parse the two date columns with a single vectorised call each instead of a
# Python-level strptime per row. Values are whitespace-stripped first, so the
# parse no longer depends on every value carrying a leading space.
# Missing values become NaT; malformed strings still raise.
df['DATE'] = pd.to_datetime(df['DATE'].str.strip(), format='%m/%d/%Y')
df['PERIOD'] = pd.to_datetime(df['PERIOD'].str.strip(), format='%m/%d/%Y')

# Round every amount up to the next whole number and store it as an integer.
# Like the per-row math.ceil it replaces, this raises if AMOUNT contains NaN.
df['AMOUNT'] = np.ceil(df['AMOUNT']).astype('int64')


In [5]:
# Columns that must be label-encoded before modelling: anything holding Python
# objects / strings, plus datetime columns. dtype predicates are used instead
# of comparing against the literal names 'object' and 'datetime64[ns]' so that
# datetime columns of any resolution (or tz-aware) and dedicated string dtypes
# are picked up as well. Every column matched by the literal comparison is
# still matched here.
catCols = [
    col
    for col in df.columns
    if pd.api.types.is_object_dtype(df[col])
    or pd.api.types.is_string_dtype(df[col])
    or pd.api.types.is_datetime64_any_dtype(df[col])
]


In [6]:
# Report how many distinct values each column selected for encoding holds,
# one "<column> <count>" line per column.
for col in catCols:
    print(f"{col} {df[col].nunique()}")

GLCODE 238
GLNAME 234
PROPERTY 56
PROPERTYNAME 54
DATE 388
PERIOD 6
DEBITCREDIT 2


In [7]:
# Replace every column listed in catCols with integer codes, working on a copy
# so the source frame is only rebound once all columns are encoded.
labelDf = df.copy()

# One fitted encoder is kept per column. Refitting a single shared encoder
# would overwrite the previous column's mapping, making it impossible to turn
# codes back into the original values (labelEncoders[col].inverse_transform)
# or to encode new data consistently (labelEncoders[col].transform).
labelEncoders = {}
labelEncoder = LabelEncoder()  # still defined even if catCols is empty

for col in catCols:
    labelEncoder = LabelEncoder()
    labelDf[col] = labelEncoder.fit_transform(df[col])
    labelEncoders[col] = labelEncoder

# From here on `df` refers to the fully numeric, encoded frame.
df = labelDf.copy()

print(df)


       GLCODE  GLNAME  PROPERTY  PROPERTYNAME  DATE  PERIOD  AMOUNT  \
0           0      27         0            25   147       0    8500   
1           0      27         0            25   157       0    2500   
2           0      27         0            25   174       0    9100   
3           0      27         0            25   176       0    3500   
4           0      27         0            25   180       0    3500   
...       ...     ...       ...           ...   ...     ...     ...   
96889     236     183        51            17   380       5    9940   
96890     237     167         4            14   365       5    2712   
96891     237     167        10             9   365       5     350   
96892     237     167        36            44   294       5    3803   
96893     237     167        51            17   365       5     175   

       DEBITCREDIT  
0                0  
1                0  
2                0  
3                0  
4                0  
...            ...  


In [8]:
# Features are every column except the target; the target is AMOUNT.
X = df.drop(columns=['AMOUNT'])
y = df['AMOUNT']

# Hold out 20% of the rows for testing, with a fixed seed for a repeatable split.
# The split subsets keep their original row labels, so their indexes are no
# longer 0..n-1. Each piece is renumbered from zero so it lines up with the
# freshly indexed prediction Series when the two are concatenated later.
XTrain, XTest, yTrain, yTest = [
    subset.reset_index(drop=True)
    for subset in train_test_split(X, y, test_size=0.2, random_state=1)
]


In [9]:
# Fit a 100-tree random forest on the training split.
# random_state is pinned (same seed as the train/test split) so that the
# bootstrap samples and feature choices -- and therefore the scores and
# predictions below -- are identical on every Restart & Run All.
model = RandomForestRegressor(n_estimators=100, random_state=1)
model.fit(XTrain, yTrain)

# Predictions for the held-out rows, wrapped in a Series with a default
# 0..n-1 index.
preds = pd.Series(model.predict(XTest))

In [10]:
# Print the model's score on the training split, then on the held-out split,
# one value per line. For scikit-learn regressors, score() is R^2.
print(model.score(XTrain, yTrain), model.score(XTest, yTest), sep='\n')

0.8609228565234
0.493258038727427


In [11]:
# Put predicted and actual amounts side by side, one row per test sample.
# Both Series are aligned on their index when joined column-wise.
comparison = pd.concat(
    [preds.rename('preds'), yTest.rename('yTest')],
    axis=1,
)
print(comparison)

             preds  yTest
0      1544.780659    177
1        94.590349    175
2       168.325980      4
3       856.485833    863
4       347.287976    641
...            ...    ...
19374   532.751167    508
19375   580.485641      4
19376   392.948750   1050
19377     2.962583      2
19378  1148.436167   1269

[19379 rows x 2 columns]
