In [15]:
import sklearn as skl
import pandas as pd
import plotly.express as px
from pathlib import Path
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [16]:
# Property listings CSV lives in the `public` folder two directory levels
# above the notebook's current working directory.
propertyPricePath = Path.cwd().parent.parent.joinpath('public', 'prop_mtl.csv')
pricedf = pd.read_csv(filepath_or_buffer=propertyPricePath)

In [17]:
# Eyeball the raw listings table; pandas elides the middle rows when printing
# a long frame, so only the head and tail are shown.
print(pricedf)

        latitud   longitud    price  beds
0     45.403712 -73.939915  4880000     5
1     45.403712 -73.939915  3895000     5
2     45.403712 -73.939915  1900000     4
3     45.403712 -73.939915  1725000     4
4     45.403743 -73.950018   800000     3
...         ...        ...      ...   ...
2991  45.699600 -73.483954   294900     2
2992  45.699600 -73.483954   284900     2
2993  45.699600 -73.483954   138000     1
2994  45.699949 -73.483657   639000     4
2995  45.699949 -73.483657   589000     3

[2996 rows x 4 columns]


In [18]:
# Features: the listing coordinates, in (latitude, longitude) column order.
X = pricedf[['latitud', 'longitud']]
# Target: asking price divided by the number of bedrooms.
df_y = pricedf['price'].div(pricedf['beds'])

In [19]:
# Hand scikit-learn plain ndarrays rather than pandas objects.
X, y = np.array(X), np.array(df_y)

In [20]:
# Hold out 10% of the rows for evaluation; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=1234,
)

In [21]:
# Nearest-neighbour regressor over the coordinate features, using the
# 4 closest training points for each prediction.
kNN = KNeighborsRegressor(n_neighbors=4)

kNN.fit(X=X_train, y=y_train)

In [22]:
# Linear-regression baseline fitted on the same training split.
linreg = LinearRegression().fit(X=X_train, y=y_train)

In [23]:
# Decision-tree regressor baseline; the fixed seed makes the fit repeatable.
treereg = DecisionTreeRegressor(random_state=1234).fit(X=X_train, y=y_train)

In [24]:
# Compare the three regressors on the held-out split. For scikit-learn
# regressors, .score() returns the coefficient of determination (R^2):
# 1.0 is a perfect fit, values near 0 are no better than predicting the mean.
print(f"kNN: {kNN.score(X_test, y_test)}")

print(f"Linear: {linreg.score(X_test, y_test)}")

print(f"Decision Tree: {treereg.score(X_test, y_test)}")

kNN: 0.3207993555672771
Linear: 0.06543637707961047
Decision Tree: 0.05709056197925455


In [25]:
# Criminal-acts CSV lives in the `public` folder two directory levels above
# the notebook's current working directory.
criminalLogsPath = Path.cwd().parent.parent.joinpath('public', 'actes-criminels.csv')
# Keep only the records that carry both coordinates.
crimedf = pd.read_csv(criminalLogsPath).dropna(subset=['LONGITUDE', 'LATITUDE'])

def getMonth(str):
    """Return the second '-'-separated field of a date string.

    Example: '2018-09-13' -> '09'. Raises IndexError when the string has
    fewer than two fields.

    NOTE: the parameter name shadows the built-in ``str`` inside this
    function; it is kept so existing callers are unaffected.
    """
    fields = str.split('-')
    return fields[1]
def getYear(str):
    """Return everything before the first '-' of a date string.

    Example: '2018-09-13' -> '2018'. A string without any '-' is returned
    unchanged.

    NOTE: the parameter name shadows the built-in ``str`` inside this
    function; it is kept so existing callers are unaffected.
    """
    head, _sep, _rest = str.partition('-')
    return head
def getDay(str):
    """Return the third '-'-separated field of a date string.

    Example: '2018-09-13' -> '13'. Raises IndexError when the string has
    fewer than three fields.

    NOTE: the parameter name shadows the built-in ``str`` inside this
    function; it is kept so existing callers are unaffected.
    """
    fields = str.split('-')
    return fields[2]
# Split the '-'-separated DATE string into separate calendar columns.
# The helpers return substrings, so these columns hold strings, not integers.
crimedf['YEAR'] = crimedf['DATE'].apply(getYear)
crimedf['MONTH'] = crimedf['DATE'].apply(getMonth)
crimedf['DAY'] = crimedf['DATE'].apply(getDay)

# Keep only the columns used for modelling.
crimedf = crimedf.loc[:, ['CATEGORIE', 'QUART', 'PDQ', 'YEAR', 'MONTH', 'DAY', 'LONGITUDE', 'LATITUDE']]

# Target: the crime category. Features: everything else.
y = crimedf['CATEGORIE']
# Explicit copy so the columns added below never alias (or warn about) crimedf.
X = crimedf.loc[:, ['YEAR', 'MONTH', 'DAY', 'QUART', 'PDQ', 'LONGITUDE', 'LATITUDE']].copy()

# One boolean indicator per QUART value (jour / soir / nuit). A vectorised
# comparison gives the same booleans as a per-row lambda without running
# Python code for every record.
X['JOUR'] = crimedf['QUART'] == 'jour'
X['SOIR'] = crimedf['QUART'] == 'soir'
X['NUIT'] = crimedf['QUART'] == 'nuit'

del X['QUART']

# One boolean indicator column per distinct PDQ value, named str(value) and
# appended in order of first appearance.
uniques = X['PDQ'].unique()

for pdq in uniques:
    if pd.isna(pdq):
        # NaN never compares equal to itself, so `== pdq` would yield an
        # all-False column; flag the rows whose PDQ is missing instead.
        X[str(pdq)] = X['PDQ'].isna()
    else:
        X[str(pdq)] = X['PDQ'] == pdq

del X['PDQ']

In [26]:
newX = X.copy()

def foo(a, b):
    """Predict price-per-bedroom for a single point with the fitted kNN model.

    Parameters
    ----------
    a : float
        Longitude of the point.
    b : float
        Latitude of the point.

    Returns
    -------
    The kNN prediction for that one point.

    The model was trained on (latitude, longitude) columns, which is why the
    arguments are swapped when the query row is built.
    """
    return kNN.predict([[b, a]])[0]

# Predict for every record in one batched call instead of calling foo() once
# per row through DataFrame.apply: the values are the same, but a single
# predict() avoids a separate model query for every row.
# Column order must be (latitude, longitude) to match the training features.
newX['PRICE/ROOMS'] = kNN.predict(newX[['LATITUDE', 'LONGITUDE']].to_numpy())

# print the updated DataFrame
print(newX)

        YEAR MONTH DAY  LONGITUDE   LATITUDE   JOUR   SOIR   NUIT   30.0  \
0       2018    09  13 -73.626778  45.567780   True  False  False   True   
1       2018    04  30 -73.626778  45.567780   True  False  False   True   
2       2018    09  01 -73.685928  45.519122  False  False   True  False   
6       2017    07  30 -73.591457  45.516776   True  False  False  False   
8       2017    08  01 -73.635117  45.602873   True  False  False  False   
...      ...   ...  ..        ...        ...    ...    ...    ...    ...   
247774  2023    01  25 -73.625616  45.528455  False  False   True  False   
247775  2023    01  25 -73.616734  45.617219  False  False   True  False   
247776  2023    01  25 -73.787533  45.482397  False  False   True  False   
247778  2023    01  25 -73.718913  45.505551  False  False   True  False   
247779  2023    01  25 -73.558643  45.532059  False  False   True  False   

          7.0  ...    3.0   11.0   10.0   20.0   15.0   26.0   13.0   55.0  \
0       F

In [27]:
# Spot-check one engineered row, selected by position (iloc is positional,
# not label-based).
# NOTE(review): 2996 equals the row count of the property table printed
# earlier — presumably just an arbitrary sample index here; confirm.
print(newX.iloc[2996])

YEAR                  2015
MONTH                   01
DAY                     25
LONGITUDE       -73.747285
LATITUDE         45.456239
JOUR                 False
SOIR                  True
NUIT                 False
30.0                 False
7.0                  False
38.0                 False
39.0                 False
5.0                  False
21.0                 False
22.0                 False
48.0                 False
12.0                 False
8.0                  False
16.0                 False
35.0                 False
31.0                 False
44.0                 False
27.0                 False
33.0                 False
9.0                  False
4.0                  False
45.0                 False
23.0                 False
24.0                 False
49.0                 False
42.0                 False
46.0                 False
1.0                  False
3.0                  False
11.0                 False
10.0                 False
20.0                 False
1

In [29]:
# to_csv does not create missing folders, so make sure the relative `data`
# directory exists first; otherwise a fresh checkout fails on this cell.
Path('data').mkdir(parents=True, exist_ok=True)
# The DataFrame index is written as the first CSV column (to_csv default).
newX.to_csv('data/randomForestTrainingData.csv')