In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, train_test_split, StratifiedKFold, KFold
from sklearn.metrics import classification_report, confusion_matrix
import joblib
from tqdm.notebook import tqdm

In [None]:
# Every file this notebook touches lives under one data folder; the trailing
# slash is part of the prefix, so names are simply concatenated onto it.
base_directory = 'khaleesi/data/'

# Encoded feature table that is loaded as the starting dataset.
all_encoded_features = f'{base_directory}all-features-encoded.csv'

# Destinations for the assembled feature matrix, its labels and the model.
final_data_set = f'{base_directory}final_dataset.csv'
final_targets = f'{base_directory}final_targets.csv'
final_classifier = f'{base_directory}final_classifier.joblib'

In [None]:
# Load the encoded feature table; later cells slice it by column position
# (first 101 columns as features, the rest containing 'target').
dataset = pd.read_csv(filepath_or_buffer=all_encoded_features)

In [None]:
# One bucket per position within a chain (0-29). Bucket p later receives the
# p-th row of every chain; the parallel dict receives the matching targets.
dataset_filled = {position: [] for position in range(30)}
targets_filled = {position: [] for position in range(30)}

# Collects every probability predicted during the cross-validated training.
predictions = []

In [None]:
pbar = tqdm(total=len(dataset), position=0, leave=True)

# The first 101 columns are used as features; the remaining columns are only
# read for their 'target' entry.
x = dataset.iloc[:, :101]
y = dataset.iloc[:, 101:]

# Regroup the flat table by position within a chain.
#   * A row whose '?' column equals 1 opens a new chain; every other row
#     extends the current one.
#   * dataset_filled[p] / targets_filled[p] receive the p-th row of each chain.
#   * Chains shorter than 30 rows are padded with the placeholders [-1] / -1 so
#     that index j addresses the same chain in every bucket.
#   * Rows at position 30 or beyond are dropped.
# NOTE(review): this assumes the very first row has '?' == 1; otherwise bucket 0
# gets no entry for the leading rows — confirm against the data.
chain_length = 0
i = 0
while i < len(dataset):
    row = x.iloc[i]
    target = y.iloc[i]['target']

    # Normalise boolean-like labels to the integers 1 / 0.
    if target == True:
        target = 1
    elif target == False:
        target = 0

    if row['?'] == 1:
        # A new chain starts: pad the positions the previous chain never
        # reached (nothing to pad before the first row of the table).
        if i != 0:
            for position in range(chain_length + 1, 30):
                dataset_filled[position].append([-1])
                targets_filled[position].append(-1)
        chain_length = 0
    else:
        chain_length += 1

    # Only the first 30 positions of a chain are kept.
    if chain_length <= 29:
        dataset_filled[chain_length].append(row.values.tolist())
        targets_filled[chain_length].append(target)

    pbar.update(1)
    i += 1

# Pad the final chain too. Padding above only happens when a *later* chain
# starts, so without this step the buckets past the last chain's length would
# be one entry shorter than bucket 0 and per-chain indices would not line up.
if len(dataset) > 0:
    for position in range(chain_length + 1, 30):
        dataset_filled[position].append([-1])
        targets_filled[position].append(-1)

pbar.close()

In [None]:
# One independent forest per (chain position, fold). Building the grid with
# list multiplication would repeat references to a single estimator, so every
# fit would silently overwrite all other entries.
classifiers = [
    [RandomForestClassifier(n_estimators=100, n_jobs=7, random_state=1) for _ in range(10)]
    for _ in range(30)
]

# Positional (list-of-lists) views over the per-position buckets. The inner row
# lists are the same objects held by dataset_filled, so the predictions
# appended below are visible through both.
dataset_filled_2d = [dataset_filled[key] for key in dataset_filled]
targets_filled_2d = [targets_filled[key] for key in targets_filled]

i = 0
while i < len(classifiers):
    skf = KFold(n_splits=10, shuffle=False)
    # KFold only needs the number of samples. The bucket mixes [-1] padding
    # with full feature rows, and converting such ragged data to an array
    # fails on recent NumPy versions, so split over plain indices instead.
    cv_iter = skf.split(np.arange(len(dataset_filled_2d[i])))

    # The last position has no successor bucket to receive predictions; decide
    # this before ever indexing position i + 1.
    has_next_position = i + 1 < len(dataset_filled_2d)

    j = 0
    for train, test in cv_iter:
        # Train only on real rows; padding entries are single-element lists.
        updated_dataset = []
        updated_targets = []
        for k in train:
            if len(dataset_filled_2d[i][k]) != 1:
                updated_dataset.append(dataset_filled_2d[i][k])
                updated_targets.append(targets_filled_2d[i][k])

        print(i, j)
        classifiers[i][j].fit(updated_dataset, updated_targets)

        if has_next_position:
            next_rows = dataset_filled_2d[i + 1]
            # Held-out chains that have a real row both here and at the next
            # position; a chain missing from the next bucket counts as padded.
            eligible = [
                k for k in test
                if dataset_filled_2d[i][k] != [-1]
                and k < len(next_rows)
                and next_rows[k] != [-1]
            ]
            if eligible:
                # One batched call per fold instead of one call per row.
                probabilities = classifiers[i][j].predict_proba(
                    [dataset_filled_2d[i][k] for k in eligible]
                )
                if probabilities.shape[1] > 1:
                    # Second column, exactly as the per-row lookup [0][1] did.
                    positive = probabilities[:, 1]
                else:
                    # Only one class was present in this fold's training rows,
                    # so there is no second column to read.
                    only_class = classifiers[i][j].classes_[0]
                    positive = np.full(len(eligible), 1.0 if only_class == 1 else 0.0)

                # Feed each out-of-fold probability forward as an extra
                # feature of the same chain's next row.
                for k, prediction in zip(eligible, positive):
                    predictions.append(prediction)
                    next_rows[k].append(prediction)
        j += 1
    i += 1

In [None]:
# Flatten the per-position buckets into one training matrix plus label list.
# Note: from here on `final_targets` names the label list, no longer the path
# string assigned in the configuration cell.
final_dataset = []
final_targets = []

# Mean of every probability collected during cross-validated training.
avg_prediction = sum(predictions) / len(predictions)

for depth, rows_at_depth in enumerate(dataset_filled_2d):
    for chain_idx, features in enumerate(rows_at_depth):
        # Skip padding placeholders.
        if features == [-1]:
            continue

        if depth == 0:
            # First rows of a chain have no earlier prediction: the overall
            # mean is appended twice, giving them two extra columns.
            features.extend([avg_prediction, avg_prediction])
        else:
            # Last value of this chain's row at the current position and at
            # every earlier one (walking back to position 0); earlier rows
            # already carry the value appended by this loop.
            trailing_values = [
                dataset_filled_2d[level][chain_idx][-1]
                for level in range(depth, -1, -1)
            ]
            features.append(sum(trailing_values) / len(trailing_values))

        final_dataset.append(features)
        final_targets.append(targets_filled_2d[depth][chain_idx])

In [None]:
# Persist the assembled feature matrix and its labels.
# `final_targets` holds the label list at this point (the name was rebound when
# the lists were built), so it cannot also serve as the destination path; the
# label file's path is rebuilt from `base_directory` instead.
pd.DataFrame(final_dataset).to_csv(final_data_set)
pd.DataFrame(final_targets).to_csv(base_directory + 'final_targets.csv')

In [None]:
# `final_classifier` is rebound below from the path string to the estimator, so
# resolve the destination first; otherwise the model would be passed to
# joblib.dump as its own filename.
final_classifier_path = base_directory + 'final_classifier.joblib'

# Train the final model on the assembled rows and labels.
final_classifier = RandomForestClassifier(n_estimators=100, n_jobs=7, random_state=1)
final_classifier.fit(final_dataset, final_targets)

joblib.dump(final_classifier, final_classifier_path)