In [1]:
# Import packages

import pandas as pd
import matplotlib.pyplot as plt
import math
from datetime import datetime
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Get data

df = pd.read_csv('export_yardi_jourentline.csv', lineterminator='\n')

df = df.drop(columns=[' BUILDING', ' SPECIALCIRCUMSTANCE', ' USAGEAMOUNT', ' USAGETYPE', ' UNIT'])

df.head()

Unnamed: 0,GLCODE,GLNAME,PROPERTY,PROPERTYNAME,DATE,PERIOD,DESCRIPTION,CONTROL,REFERENCE,AMOUNT,DEBITCREDIT,BALANCE,REMARKS
0,1110-0000,Cash,ap-mc11,4203 11TH LLC,09/09/2020,12/01/2020,Webster S8 (t0034597),K-396581,165,8500.0,Credit,1813181.59,First buyout check
1,1110-0000,Cash,ap-mc11,4203 11TH LLC,09/29/2020,12/01/2020,Webster (t0034598),K-405215,173,2500.0,Credit,1810681.59,
2,1110-0000,Cash,ap-mc11,4203 11TH LLC,10/27/2020,12/01/2020,Webster S8 (t0034597),K-417819,183,9100.0,Credit,1801581.59,
3,1110-0000,Cash,ap-mc11,4203 11TH LLC,10/29/2020,12/01/2020,Jones (t0034602),K-420897,187,3500.0,Credit,1798081.59,First Buyout Check
4,1110-0000,Cash,ap-mc11,4203 11TH LLC,11/02/2020,12/01/2020,Dow (t0034603),K-421957,190,3500.0,Credit,1794581.59,First Buyout Check


In [3]:
# Basic column manipulation

df.columns = df.columns.map(lambda c: c.strip())

df['DATE'] = df['DATE'].map(lambda d: datetime.strptime(d, ' %m/%d/%Y'))
df['PERIOD'] = df['PERIOD'].map(lambda p: datetime.strptime(p, ' %m/%d/%Y'))

df['AMOUNT'] = df['AMOUNT'].map(lambda a: math.ceil(a))

In [4]:
# Find categorical columns

catCols = [col for col in df.columns if df[col].dtype == 'object' or df[col].dtype == 'datetime64[ns]']

In [5]:
# Label encode dataset

labelDf = df.copy()

labelEncoder = LabelEncoder()

for col in catCols:
    labelDf[col] = labelEncoder.fit_transform(df[col])
    
df = labelDf.copy()
print(df)

       GLCODE  GLNAME  PROPERTY  PROPERTYNAME  DATE  PERIOD  DESCRIPTION  \
0           0      27         0            25   147       0         1507   
1           0      27         0            25   157       0         1506   
2           0      27         0            25   174       0         1507   
3           0      27         0            25   176       0          981   
4           0      27         0            25   180       0          747   
...       ...     ...       ...           ...   ...     ...          ...   
96889     236     183        51            17   380       5          599   
96890     237     167         4            14   365       5          522   
96891     237     167        10             9   365       5          522   
96892     237     167        36            44   294       5          522   
96893     237     167        51            17   365       5          522   

       CONTROL  REFERENCE  AMOUNT  DEBITCREDIT  BALANCE  REMARKS  
0        11952      

In [6]:
# Define X and y for testing, and split

X = df.copy()
X = X.drop(['AMOUNT'], axis=1)
y = df['AMOUNT'].copy()

XTrain, XTest, yTrain, yTest = train_test_split(X, y, test_size=0.2, random_state=1)

XTrain = XTrain.reset_index(drop=True)
yTrain = yTrain.reset_index(drop=True)
XTest = XTest.reset_index(drop=True)
yTest = yTest.reset_index(drop=True)
print(XTrain)

       GLCODE  GLNAME  PROPERTY  PROPERTYNAME  DATE  PERIOD  DESCRIPTION  \
0          49     126        42             4   325       4          780   
1          81     119        35             1   234       0           13   
2           0      27        42             4   206       0          883   
3         215     215        33            26   218       0          509   
4          49     126        20            31   266       2          558   
...       ...     ...       ...           ...   ...     ...          ...   
77510       9       3        37            49   296       3         1222   
77511      46       1        37            49   354       4         1300   
77512      81     119        31            39   293       2           13   
77513       0      27        20            31   253       1          548   
77514      91     133        31            39   325       4          773   

       CONTROL  REFERENCE  DEBITCREDIT  BALANCE  REMARKS  
0        20137        966   

In [7]:
# Fit/predict model

model = RandomForestRegressor(n_estimators=1000)
model.fit(XTrain, yTrain)

preds = model.predict(XTest)

In [8]:
# Define scoring func. for regressor

def score(predictions, values):
    
    totalPercentOff = 0
    
    for i in range(0, len(predictions)):
        
        pred = predictions[i]
        v = values[i]
        
        difference = abs(pred - v)
        percentOff = difference * (100/v)
        #print(percentOff)
        
        totalPercentOff += percentOff
        
    avgPercentOff = totalPercentOff / len(predictions)
    print(avgPercentOff)
        

In [9]:
# Score test model

score(preds, yTest)


3604.251250938953


In [10]:
# Define outliers func.

def findOutliers(predictions, values):

    outlierIndexes = []
    inlierIndexes = []

    for i in range(0,len(predictions)):

        pred = predictions[i]
        v = values[i]

        if abs(pred - v) > pred/100:
            outlierIndexes.append(i)
        else:
            inlierIndexes.append(i)

    return outlierIndexes

In [11]:
# Find outliers for test model

testOutliers = findOutliers(preds, yTest)

print(len(testOutliers))

15889


In [12]:
# Concat train and test into full data

XFull = pd.concat([XTrain, XTest])
yFull = pd.concat([yTrain, yTest])

fullPreds = model.predict(XFull)

In [13]:
# Convert Series to List

fullPredsList = list(fullPreds)
yFullList = list(yFull)


In [14]:
# Score model

score(fullPreds, yFullList)

1677.9177425458145


In [15]:
# Find outliers for model

fullOutliers = findOutliers(fullPreds, yFullList)

print(len(fullOutliers))

73848


In [16]:
# Label data

labeledDf = df.copy()

outlierBoolList = []

for i in range(0, len(labeledDf)):
    
    valueCount = fullOutliers.count(i)
    
    if valueCount > 0:
        outlierBoolList.append(1)
        #print(fullPreds[i], yFullList[i])
    else:
        outlierBoolList.append(0)
        #print(fullPreds[i], yFullList[i])
        

labeledDf['OUTLIER'] = outlierBoolList  

print(len([x for x in labeledDf['OUTLIER'] if x == 1]))
print(labeledDf)
    

73848
       GLCODE  GLNAME  PROPERTY  PROPERTYNAME  DATE  PERIOD  DESCRIPTION  \
0           0      27         0            25   147       0         1507   
1           0      27         0            25   157       0         1506   
2           0      27         0            25   174       0         1507   
3           0      27         0            25   176       0          981   
4           0      27         0            25   180       0          747   
...       ...     ...       ...           ...   ...     ...          ...   
96889     236     183        51            17   380       5          599   
96890     237     167         4            14   365       5          522   
96891     237     167        10             9   365       5          522   
96892     237     167        36            44   294       5          522   
96893     237     167        51            17   365       5          522   

       CONTROL  REFERENCE  AMOUNT  DEBITCREDIT  BALANCE  REMARKS  OUTLIER  
0    

In [17]:
# Define cX and cy for testing, and split

cX = labeledDf.copy()
cX = cX.drop(['OUTLIER'], axis=1)

cy = labeledDf['OUTLIER']

cXTrain, cXTest, cyTrain, cyTest = train_test_split(cX, cy, test_size=0.2, random_state=1)

In [18]:
# Fit/predict cModel

cModel = DecisionTreeClassifier(random_state=0)
cModel.fit(cXTrain, cyTrain)

cPreds = cModel.predict(cXTest)

In [19]:
# Convert Series to List

cPredsList = list(cPreds)
cyTestList = list(cyTest)


In [20]:
# Define scoring func. for classifier

def cScore(predictions, values):
    
    correctIndexes = []
    incorrectIndexes = []
    
    for i in range(0, len(predictions)):
        
        prediction = int(predictions[i])
        value = int(values[i])
        
        if prediction == value:
            correctIndexes.append(i)
        else:
            incorrectIndexes.append(i)
            
    numCorrect = len(correctIndexes)
    numIncorrect = len(incorrectIndexes)
    
    percentCorrect = round(numCorrect * (100/len(predictions)))
    print(percentCorrect, '%')
    

In [21]:
# Score classifier

cScore(cPredsList, cyTestList)

63 %
