In [1]:
import pandas as pd
import numpy as np
import math
import time
from datetime import datetime
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler

pd.plotting.register_matplotlib_converters()
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns


In [2]:
df = pd.read_csv('/Users/dhruv/code/export_yardi_jourentline.csv', lineterminator='\n')
df.head()

Unnamed: 0,GLCODE,GLNAME,PROPERTY,PROPERTYNAME,UNIT,BUILDING,SPECIALCIRCUMSTANCE,USAGEAMOUNT,USAGETYPE,DATE,PERIOD,DESCRIPTION,CONTROL,REFERENCE,AMOUNT,DEBITCREDIT,BALANCE,REMARKS
0,1110-0000,Cash,ap-mc11,4203 11TH LLC,42031,,,,,09/09/2020,12/01/2020,Webster S8 (t0034597),K-396581,165,8500.0,Credit,1813181.59,First buyout check
1,1110-0000,Cash,ap-mc11,4203 11TH LLC,42032,,,,,09/29/2020,12/01/2020,Webster (t0034598),K-405215,173,2500.0,Credit,1810681.59,
2,1110-0000,Cash,ap-mc11,4203 11TH LLC,42031,,,,,10/27/2020,12/01/2020,Webster S8 (t0034597),K-417819,183,9100.0,Credit,1801581.59,
3,1110-0000,Cash,ap-mc11,4203 11TH LLC,42052,,,,,10/29/2020,12/01/2020,Jones (t0034602),K-420897,187,3500.0,Credit,1798081.59,First Buyout Check
4,1110-0000,Cash,ap-mc11,4203 11TH LLC,42053,,,,,11/02/2020,12/01/2020,Dow (t0034603),K-421957,190,3500.0,Credit,1794581.59,First Buyout Check


In [3]:
# ***** PREPROCESSING *****

In [4]:
df.columns = df.columns.map(lambda c: c.strip())

cols = ['GLCODE', 'PROPERTY', 'UNIT', 'DATE', 'PERIOD', 'AMOUNT', 'DEBITCREDIT', 'DESCRIPTION', 'REMARKS']
df = df[cols]

df['REMARKS'] = df['REMARKS'].map(lambda r: r.lower())

df.head()


Unnamed: 0,GLCODE,PROPERTY,UNIT,DATE,PERIOD,AMOUNT,DEBITCREDIT,DESCRIPTION,REMARKS
0,1110-0000,ap-mc11,42031,09/09/2020,12/01/2020,8500.0,Credit,Webster S8 (t0034597),first buyout check
1,1110-0000,ap-mc11,42032,09/29/2020,12/01/2020,2500.0,Credit,Webster (t0034598),
2,1110-0000,ap-mc11,42031,10/27/2020,12/01/2020,9100.0,Credit,Webster S8 (t0034597),
3,1110-0000,ap-mc11,42052,10/29/2020,12/01/2020,3500.0,Credit,Jones (t0034602),first buyout check
4,1110-0000,ap-mc11,42053,11/02/2020,12/01/2020,3500.0,Credit,Dow (t0034603),first buyout check


In [5]:
X = df.copy()
X = X.drop(['AMOUNT'], axis=1)

y = df['AMOUNT']

preXTrain, preXTest, yTrain, yTest = train_test_split(X, y, test_size=0.15, random_state=1)

preXTrain = preXTrain.reset_index(drop=True)
yTrain = yTrain.reset_index(drop=True)
preXTest = preXTest.reset_index(drop=True)
yTest = yTest.reset_index(drop=True)


In [6]:
catCols = [col for col in X.columns if X[col].dtype == 'object']


In [7]:
labelXTrain = preXTrain.copy()
labelXTest = preXTest.copy()

le = LabelEncoder()

for col in catCols:
    le.fit(labelXTrain[col])
    
    labelXTest[col] = labelXTest[col].map(lambda s: '<unknown>' if s not in le.classes_ else s)
    
    le.classes_ = np.append(le.classes_, '<unknown>')
    
    labelXTrain[col] = le.transform(labelXTrain[col])
    labelXTest[col] = le.transform(labelXTest[col])
    
XTrain = labelXTrain.copy()
XTest = labelXTest.copy()


In [8]:
# ***** MODEL CREATION *****

In [9]:
model = RandomForestRegressor(n_estimators=40, random_state=0)
model.fit(XTrain, yTrain)

testPreds = pd.Series(model.predict(XTest))
testPreds = testPreds.map(lambda pred: round(pred, 2))


In [10]:
# ***** MODEL EVALUATION *****

In [11]:
acceptablePercent = 5  
percentsList = []

predHighCount = 0
predLowCount = 0

for i in range(0, len(yTest)):
    
    real = yTest[i]
    pred = testPreds[i]
    
    percentOfReal = (pred/real)*100
    percentOff = abs(100-percentOfReal)
    
    percentsList.append(round(percentOff, 2))
    
    
    if pred > real:
        predHighCount = predHighCount+1
    else:
        predLowCount = predLowCount+1
    

goodPreds = [x for x in percentsList if x < acceptablePercent]
badPreds = [x for x in percentsList if x > acceptablePercent]

print(len(goodPreds), len(badPreds))
print(predHighCount, predLowCount)


4830 9687
7145 7390


In [34]:
percentsDf = pd.DataFrame(percentsList, columns=['percentOff'])
percentsDf.head()

def fn(x):
    if (x>100): return '>100'
    return '<100'

    
print(percentsDf.columns)

percentsDf['_category'] = percentsDf.apply(lambda p: fn(p['percentOff']), axis=1)
percentsDf[['percentOff', '_category']]








Index(['percentOff'], dtype='object')


Unnamed: 0,_category
0,<100
1,<100
2,>100
3,<100
4,<100
...,...
14530,<100
14531,>100
14532,<100
14533,<100


In [13]:
comparison = pd.concat([yTest, testPreds], axis=1)
comparison.columns = ['actual', 'predicted']
print(comparison)


       actual  predicted
0      176.07     290.50
1      174.03      84.49
2        3.61     228.91
3      863.00     665.27
4      640.66     935.37
...       ...        ...
14530   70.00      68.53
14531    0.36       1.04
14532   47.19      43.64
14533    3.60       3.60
14534  225.00     246.24

[14535 rows x 2 columns]


In [14]:
print('R2:', model.score(XTest, yTest))
print('MAE:', metrics.mean_absolute_error(yTest, testPreds))
print('RMSE:', metrics.mean_squared_error(yTest, testPreds, squared=False))
print('MAPE:', np.mean(np.abs((yTest - testPreds) / np.abs(yTest))))


R2: 0.5071391110595351
MAE: 1606.129045751634
RMSE: 32379.78953166789
MAPE: 382.04661002199833
