In [2]:
import pandas as pd
import numpy as np
import datetime
from pyproj import Transformer

df = pd.read_csv('./riista-utf8.csv')


# Enrichment steps

# convert the location data to longitude and latitude info
# 3067 is the EPSG code used by the orignal data
# 4326 is the Standard WGS84 longitude/latitude coords
transformer = Transformer.from_crs(3067, 4326)
df['lat'], df['lon'] = np.transpose(np.array([transformer.transform(x, y) for x, y in zip(df['x'], df['y'])]))


# write the lat & long info to a file so we can use them in a seprate script to call https://sunrise-sunset.org/api
# since we need to do 60k+ API calls, it will take some time. That's why it's being done on a separate server and not
# on this Jupyter notebook
df.to_csv('coordinates.csv', columns=['tapahtumaAika', 'lat', 'lon'], header=False)

# read back the results and join with the original data set
timeData = pd.read_csv('./daylight-api-output.txt', header=None, names=['id','tapahtumaAika', 'valoisuus'])
df['valoisuus'] = timeData['valoisuus'] * -1 # Reverse the mapping codes used in the API fetcher script


In [3]:
df

Unnamed: 0,id,tapahtumaAika,vuosi,kuukausi,x,y,kunta,kuntaNimi,maakunta,maakuntaNimi,tielaji,tielajis,tieNumero,tieYllapito,tieYllapitoNimi,riistalaji,riistalajiNimi,lat,lon,valoisuus
0,1,2017-12-10T16:55:00.000+02:00,2017,12,278773.65900,6.980878e+06,743,Seinäjoki,14,Etelä-Pohjanmaa,3,Valtatie,18.0,1.0,Valtio,47507,Metsäkauris,62.890515,22.646311,3
1,2,2017-09-03T13:00:00.000+03:00,2017,9,397501.00800,7.503084e+06,261,Kittilä,19,Lappi,4,Kantatie,80.0,1.0,Valtio,47503,Hirvi,67.625279,24.586614,0
2,3,2017-09-18T07:00:00.000+03:00,2017,9,418353.84100,7.514105e+06,261,Kittilä,19,Lappi,6,Muu maantie,9552.0,1.0,Valtio,47503,Hirvi,67.730620,25.069129,0
3,4,2017-12-01T17:00:00.000+02:00,2017,12,488970.82700,6.722504e+06,285,Kotka,8,Kymenlaakso,5,Seututie,357.0,1.0,Valtio,47629,Valkohäntäpeura,60.638182,26.798369,3
4,5,2017-09-18T07:00:00.000+03:00,2017,9,418766.58000,7.506931e+06,261,Kittilä,19,Lappi,6,Muu maantie,9552.0,1.0,Valtio,47503,Hirvi,67.666425,25.084131,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63937,63938,2021-10-04T09:30:00.000+03:00,2021,10,236501.85584,6.691606e+06,445,Parainen,2,Varsinais-Suomi,5,Seututie,180.0,1.0,Valtio,47507,Metsäkauris,60.275376,22.233479,0
63938,63939,2021-09-20T19:51:48.017+03:00,2021,9,515750.42581,6.946907e+06,778,Suonenjoki,11,Pohjois-Savo,3,Valtatie,9.0,1.0,Valtio,47503,Hirvi,62.652474,27.307304,3
63939,63940,2021-05-29T06:00:00.000+03:00,2021,5,245406.95714,6.712757e+06,202,Kaarina,2,Varsinais-Suomi,4,Kantatie,40.0,1.0,Valtio,47507,Metsäkauris,60.470338,22.367095,0
63940,63941,2021-04-13T14:30:00.000+03:00,2021,4,238690.96803,6.705560e+06,853,Turku,2,Varsinais-Suomi,7,Puuttuu,,99.0,Ei tietoa,47507,Metsäkauris,60.401653,22.254785,0


In [285]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler

#model to predict what animal is the most likely to be encountered in a crash on given month and location

#drop unneeded dimensions, drop NaNs

data = df.drop(['id', 'tapahtumaAika','tieNumero', 'maakuntaNimi', 'vuosi', 'tielajis','x','y','lat','lon','maakunta', 'riistalajiNimi', 'kuntaNimi', 'tieYllapito', 'tieYllapitoNimi'], axis=1)
# 
data['kuukausi'] = data['kuukausi'].astype('category')
data['valoisuus'] = data['valoisuus'].astype('category')
data['kunta'] = data['kunta'].astype('category')
data['tielaji'] = data['tielaji'].astype('category')


data = data.dropna()

# extract the target category
target = data['riistalaji'].astype('category')
data = data.drop(['riistalaji'], axis=1)

data.info()

training_data, test_data, train_target, test_target = train_test_split(data, target, train_size=0.8)

# scaler = preprocessing.RobustScaler()
lr = LogisticRegression(max_iter = 5000, multi_class='multinomial', solver='saga')

#pipeline = Pipeline([('scaler', scaler), ('lr',lr)])
pipeline = Pipeline([('lr',lr)])

model = pipeline.fit(training_data, train_target)
predictedLr = model.predict(test_data)

print('LR  accuracy:', accuracy_score(test_target, predictedLr))




<class 'pandas.core.frame.DataFrame'>
RangeIndex: 63942 entries, 0 to 63941
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   kuukausi   63942 non-null  category
 1   kunta      63942 non-null  category
 2   tielaji    63942 non-null  category
 3   valoisuus  63942 non-null  category
dtypes: category(4)
memory usage: 323.8 KB
LR  accuracy: 0.4930799906169364


In [293]:

location = 'Kittilä'
kunta = df['kunta'].where(df['kuntaNimi'] == location).dropna().head(1).values[0]

for i in range(1,13):
    month = i
    data_to_predict = pd.DataFrame({'kuukausi': [month], 'kunta': [kunta], 'tielaji': [3], 'valoisuus': [-4]})
    p = lr.predict(data_to_predict)
    print('Kuukausi', month, df['riistalajiNimi'].where(df['riistalaji'] == p[0]).dropna().head(1).values[0])

    

Kuukausi 1 Metsäkauris
Kuukausi 2 Metsäkauris
Kuukausi 3 Metsäkauris
Kuukausi 4 Metsäkauris
Kuukausi 5 Metsäkauris
Kuukausi 6 Metsäkauris
Kuukausi 7 Valkohäntäpeura
Kuukausi 8 Valkohäntäpeura
Kuukausi 9 Valkohäntäpeura
Kuukausi 10 Valkohäntäpeura
Kuukausi 11 Valkohäntäpeura
Kuukausi 12 Valkohäntäpeura
