In [None]:
import pandas as pd
import numpy as np
import math
import time
from datetime import datetime
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler

pd.plotting.register_matplotlib_converters()
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns


In [None]:
# TODO

# change the date and period to an int (either month and year as separate columns or a timestamp)
# experiment with column combinations more
# experiment with training on whole dataset

In [None]:
# Read the exported CSV into `df`; rows are split on '\n' only.
# NOTE(review): this is an absolute, machine-specific path — consider pointing
# it at a configurable data directory before sharing the notebook.
CSV_PATH = '/Users/dhruv/code/export_yardi_jourentline.csv'

df = pd.read_csv(CSV_PATH, lineterminator='\n')
df.head()

In [None]:
# ***** PREPROCESSING *****

In [None]:
# Clean the raw frame and derive the model features:
#   * strip padding from the column headers,
#   * keep only the columns used downstream,
#   * split DATE into integer day / month / year columns,
#   * add 0/1 DEBIT and CREDIT indicator columns,
#   * lower-case REMARKS.
df.columns = df.columns.str.strip()

cols = ['GLCODE', 'PROPERTY', 'UNIT', 'DATE', 'AMOUNT', 'DEBITCREDIT', 'DESCRIPTION', 'REMARKS']
# Explicit copy so the column assignments below act on an independent frame.
df = df[cols].copy()

# Parse DATE in one vectorised call. The values are stripped first so both
# padded (' 01/31/2020') and unpadded ('01/31/2020') dates are accepted; a value
# that does not match month/day/year still raises.
parsedDate = pd.to_datetime(df['DATE'].str.strip(), format='%m/%d/%Y')

# Cast to int64 so the dtype does not depend on the pandas version; a missing
# date fails loudly here instead of leaking NaN into the features.
df['DATEDAY'] = parsedDate.dt.day.astype('int64')
df['DATEMO'] = parsedDate.dt.month.astype('int64')
df['DATEYR'] = parsedDate.dt.year.astype('int64')

df = df.drop(columns=['DATE'])

# Compare on stripped values: the DATE field evidently carries leading
# whitespace, so the other text fields may too — TODO confirm against the data.
# The DEBITCREDIT column itself is left unchanged. Missing values yield 0 in
# both indicators.
entrySide = df['DEBITCREDIT'].str.strip()
df['DEBIT'] = (entrySide == 'Debit').astype('int64')
df['CREDIT'] = (entrySide == 'Credit').astype('int64')

# `.str.lower()` leaves missing remarks as NaN instead of raising on a float.
df['REMARKS'] = df['REMARKS'].str.lower()

df.head()


In [None]:
# Scatter AMOUNT against one column to eyeball their relationship;
# change `column` to inspect a different feature.
column = 'DEBITCREDIT'

fig, ax = plt.subplots(figsize=(10, 6))
ax.set_title(column)

sns.scatterplot(x=df[column], y=df['AMOUNT'], ax=ax)


In [None]:
# Target is AMOUNT; every other column is a feature.
y = df['AMOUNT']
X = df.drop(columns=['AMOUNT'])

# Hold out 15% of the rows for testing; the fixed seed keeps the split repeatable.
preXTrain, preXTest, yTrain, yTest = train_test_split(
    X, y, test_size=0.15, random_state=1
)

# To train and evaluate on the whole dataset instead, replace the split with:
#preXTrain, yTrain = X, y
#preXTest, yTest = X, y

# Renumber every part 0..n-1 so the test targets line up with a freshly built
# prediction Series.
preXTrain, preXTest, yTrain, yTest = (
    part.reset_index(drop=True)
    for part in (preXTrain, preXTest, yTrain, yTest)
)


In [None]:
# Names of the feature columns stored with the generic 'object' dtype;
# these are the ones that need encoding before modelling.
catCols = [name for name, dtype in X.dtypes.items() if dtype == 'object']


In [None]:
# Integer-encode every categorical column.
# Each encoder is fitted on the training split only; test values that were never
# seen in training are folded into a single '<unknown>' label so that
# `transform` does not fail on them.
labelXTrain = preXTrain.copy()
labelXTest = preXTest.copy()

UNKNOWN_LABEL = '<unknown>'

# column name -> fitted LabelEncoder, kept so the integer codes can be mapped
# back to the original labels later.
encoders = {}

le = LabelEncoder()  # rebound to a fresh encoder for each column below

for col in catCols:
    le = LabelEncoder()
    le.fit(labelXTrain[col])

    # Vectorised membership test: one pass over the column instead of scanning
    # the whole class array once per row.
    seenInTrain = labelXTest[col].isin(le.classes_)
    labelXTest[col] = labelXTest[col].where(seenInTrain, UNKNOWN_LABEL)

    # Register the placeholder as an extra class (skipped if the training data
    # already contains that literal label, to avoid a duplicate class).
    # NOTE(review): appending leaves classes_ unsorted; this relies on
    # LabelEncoder.transform looking labels up by value for object columns —
    # confirm for the installed scikit-learn version.
    if UNKNOWN_LABEL not in set(le.classes_):
        le.classes_ = np.append(le.classes_, UNKNOWN_LABEL)

    labelXTrain[col] = le.transform(labelXTrain[col])
    labelXTest[col] = le.transform(labelXTest[col])

    encoders[col] = le

XTrain = labelXTrain.copy()
XTest = labelXTest.copy()


In [None]:
# ***** MODEL CREATION *****

In [None]:
# Fit a 35-tree random forest (fixed seed) on the encoded training features,
# then predict the held-out rows.
model = RandomForestRegressor(n_estimators=35, random_state=0)
model.fit(XTrain, yTrain)

rawPreds = model.predict(XTest)

# Predictions are rounded to two decimals.
testPreds = pd.Series(rawPreds).map(lambda value: round(value, 2))

In [None]:
# ***** MODEL EVALUATION *****

In [None]:
# Actual vs. predicted amounts side by side (aligned on the shared 0..n-1
# index); show the first 25 rows.
comparison = pd.concat({'actual': yTest, 'predicted': testPreds}, axis=1)
print(comparison.head(25))


In [None]:
# Absolute percentage error of every prediction relative to the actual amount,
# rounded to two decimals: |100 - (predicted / actual) * 100|.
# Computed on whole Series (aligned on their shared 0..n-1 index) instead of a
# Python loop over positions.
percentOfReal = (testPreds / yTest) * 100
percentOff = (100 - percentOfReal).abs()

# Predicting 0 for an actual 0 is an exact hit, but 0/0 evaluates to NaN;
# report it as 0% off. A non-zero prediction for an actual 0 stays infinite.
exactZeroHit = (yTest == 0) & (testPreds == 0)
percentOff = percentOff.mask(exactZeroHit, 0.0).round(2)

percentsList = percentOff.tolist()
percentsDf = pd.DataFrame(percentsList, columns=['percentOff'])

def categorizePercents(p):
    """Map a percentage error to the label of the bucket it falls in.

    Buckets are half-open ranges labelled by their upper bound:
    [0, 5) -> 5, [5, 10) -> 10, [10, 20) -> 20, [20, 50) -> 50,
    [50, 100) -> 100, [100, 200) -> 200, [200, 500) -> 500,
    [500, 1000) -> 1000. Anything from 1000 upwards is labelled 10000.

    Returns the string 'unknown' for values that are not >= 0
    (negative numbers and NaN).
    """
    # NaN fails every comparison and negatives fail this one.
    if not p >= 0:
        return 'unknown'

    # Each bucket's label is its exclusive upper bound.
    for upperBound in (5, 10, 20, 50, 100, 200, 500, 1000):
        if p < upperBound:
            return upperBound

    return 10000


# Label every error with its bucket, then count the rows per bucket and express
# each count as a share of all rows.
percentsDf['category'] = percentsDf['percentOff'].map(categorizePercents)

bucketSizes = percentsDf.groupby('category').size()
percentRangeDf = bucketSizes.reset_index(name='count')

totalRows = len(percentsDf)
percentRangeDf['percentOfTotal'] = percentRangeDf['count'] / totalRows * 100

print(percentRangeDf)


In [None]:
# Bar chart: number of test rows falling in each percentage-error bucket.
fig, ax = plt.subplots(figsize=(10, 6))
ax.set_title('Bar Chart of %Error Counts')

sns.barplot(x=percentRangeDf['category'], y=percentRangeDf['count'], ax=ax)


In [None]:
# Summary regression metrics on the held-out rows.
print('R2:', metrics.r2_score(yTest, testPreds))
print('MAE:', metrics.mean_absolute_error(yTest, testPreds))
# RMSE is taken as the square root of the MSE rather than via `squared=False`,
# which recent scikit-learn releases deprecated and then removed.
print('RMSE:', np.sqrt(metrics.mean_squared_error(yTest, testPreds)))
