In [1]:
import pickle
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm

from datetime import timedelta

from sklearn.model_selection import TimeSeriesSplit
from sklearn.linear_model import LogisticRegression

In [2]:
PROCESSED_DATA_FOLDER = "data/4_all_data_preprocessed"
DATASET_FILE = "all_features"

In [4]:
# Load dataset
dataset = pickle.load(open(f"{PROCESSED_DATA_FOLDER}/{DATASET_FILE}.pkl", "rb"))

In [5]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 192867 entries, 0 to 192866
Columns: 7323 entries, city_resolvedAddress to isw_zyuganov
dtypes: Sparse[float64, 0](7281), bool(1), float64(38), object(3)
memory usage: 1.3+ GB


In [6]:
# check the size of the data
dataset.shape

(192867, 7323)

In [7]:
# check few lines
dataset.head()

Unnamed: 0,city_resolvedAddress,day_datetime,isw_date_tomorrow_datetime,event_indicator,day_tempmax,day_tempmin,day_temp,day_dew,day_humidity,day_precip,...,isw_zolot,isw_zolota,isw_zolotarivka,isw_zone,isw_zorya,isw_ztpp,isw_zvanivka,isw_zvezda,isw_zybinski,isw_zyuganov
0,"Луцьк, Луцький район, Україна",2022-02-25,2022-02-25,False,6.3,-0.3,2.2,-1.2,78.3,0.2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"Луцьк, Луцький район, Україна",2022-02-25,2022-02-25,False,6.3,-0.3,2.2,-1.2,78.3,0.2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Луцьк, Луцький район, Україна",2022-02-25,2022-02-25,False,6.3,-0.3,2.2,-1.2,78.3,0.2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"Луцьк, Луцький район, Україна",2022-02-25,2022-02-25,False,6.3,-0.3,2.2,-1.2,78.3,0.2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"Луцьк, Луцький район, Україна",2022-02-25,2022-02-25,False,6.3,-0.3,2.2,-1.2,78.3,0.2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
# check nan
dataset.isna().sum()

city_resolvedAddress          0
day_datetime                  0
isw_date_tomorrow_datetime    0
event_indicator               0
day_tempmax                   0
                             ..
isw_ztpp                      0
isw_zvanivka                  0
isw_zvezda                    0
isw_zybinski                  0
isw_zyuganov                  0
Length: 7323, dtype: int64

In [14]:
dataset.dtypes

city_resolvedAddress                      object
day_datetime                              object
isw_date_tomorrow_datetime                object
event_indicator                             bool
day_tempmax                              float64
                                     ...        
isw_ztpp                      Sparse[float64, 0]
isw_zvanivka                  Sparse[float64, 0]
isw_zvezda                    Sparse[float64, 0]
isw_zybinski                  Sparse[float64, 0]
isw_zyuganov                  Sparse[float64, 0]
Length: 7323, dtype: object

In [61]:
# Sort dataset by date
dataset['day_datetime'] = pd.to_datetime(dataset['day_datetime'])
dataset = dataset.sort_values(by='day_datetime')
dataset.day_datetime.describe(datetime_is_numeric=True)

count                           192867
mean     2022-08-02 23:32:38.163916288
min                2022-02-25 00:00:00
25%                2022-05-10 00:00:00
50%                2022-08-02 00:00:00
75%                2022-10-24 00:00:00
max                2023-01-20 00:00:00
Name: day_datetime, dtype: object

In [76]:
dataset['target'] = dataset['event_indicator']
dataset = dataset.drop(columns=['event_indicator'])

# check the event proportion of the data
dataset['target'].value_counts()

False    143773
True      49094
Name: target, dtype: int64

In [67]:
# set an excluded list
exclude_list = ['city_resolvedAddress', 'target']

In [68]:
min_date = dataset['day_datetime'].min()
max_date = dataset['day_datetime'].max()
print("Min:", min_date, "Max:", max_date)

Min: 2022-02-25 00:00:00 Max: 2023-01-20 00:00:00


In [69]:
train_percent = .75
time_between = max_date - min_date
train_cutoff = min_date + train_percent*time_between
train_cutoff

Timestamp('2022-10-29 18:00:00')

In [22]:
train_df = dataset[dataset.day_datetime <= train_cutoff]
test_df = dataset[dataset.day_datetime > train_cutoff]

print("Train:", train_df.day_datetime.min(), train_df.day_datetime.max())
print("Test:", test_df.day_datetime.min(), test_df.day_datetime.max())

Train: 2022-02-25 00:00:00 2022-10-29 00:00:00
Test: 2022-10-30 00:00:00 2023-01-20 00:00:00


In [70]:
days_between = time_between / np.timedelta64(1, 'D')
days_between

329.0

In [71]:
# Train logistic regression model
model = LogisticRegression()

In [26]:
# Split the dataset into features (X) and target (y)
X = dataset.drop(columns=['target'])
y = dataset['target']

# Set the duration of each test set to 2 months
test_duration = timedelta(days=60)

# Initialize the TimeSeriesSplit object
tscv = TimeSeriesSplit(n_splits=len(dataset)//test_duration.days)

    
# Loop over the splits
for train_index, test_index in tscv.split(X):
    # Get the start and end times of the test set
    test_start_time = X.iloc[test_index[0]]['day_datetime']
    test_end_time = X.iloc[test_index[-1]]['day_datetime']

    # Split the data into train and test sets based on the time duration of the test set
    train_mask = (X['day_datetime'] < test_start_time) | (X['day_datetime'] > test_end_time)
    test_mask = (X['day_datetime'] >= test_start_time) & (X['day_datetime'] <= test_end_time)
    X_train, X_test = X.loc[train_mask], X.loc[test_mask]
    y_train, y_test = y.loc[train_mask], y.loc[test_mask]

    # Train your machine learning model using X_train and y_train
    model.fit(X_train, y_train)

    # Evaluate your model on the test set
    accuracy = model.score(X_test, y_test)
    print(f"Accuracy: {accuracy}")

TypeError: object of type 'float' has no len()

In [28]:
# Split the data into train and test sets using a rolling window
window_size = 100 # set the size of each test set
n_splits = len(dataset) // window_size # calculate the number of splits we'll need

In [84]:
# Loop over the splits
for train_index, test_index in tscv.split(X):
    # Get the start and end times of the test set
    test_start_time = X.iloc[test_index[0]]['day_datetime']
    test_end_time = X.iloc[test_index[-1]]['day_datetime']

    # Split the data into train and test sets based on the time duration of the test set
    train_mask = (X['day_datetime'] < test_start_time) | (X['day_datetime'] > test_end_time)
    test_mask = (X['day_datetime'] >= test_start_time) & (X['day_datetime'] <= test_end_time)
    X_train, X_test = X.loc[train_mask], X.loc[test_mask]
    y_train, y_test = y.loc[train_mask], y.loc[test_mask]

    # Train your machine learning model using X_train and y_train
    model.fit(X_train, y_train)

    # Evaluate your model on the test set
    accuracy = model.score(X_test, y_test)
    print(f"Accuracy: {accuracy}")

  array = numpy.asarray(array, order=order, dtype=dtype)


MemoryError: Unable to allocate 10.5 GiB for an array with shape (192274, 7322) and data type float64

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

3214