In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from tqdm.notebook import tqdm
import joblib
import utilities

In [None]:
# Every input and output of this notebook lives under one data directory.
# The trailing slash is part of the value: the file names below are appended
# to it directly.
base_directory = 'khaleesi/data/'

# Inputs: the feature table and the label table.
final_data_set = f'{base_directory}final_dataset.csv'
final_targets = f'{base_directory}final_targets.csv'

# Output: despite the `_dir` suffix this is a file path, not a directory.
predictions_dir = f'{base_directory}detections.csv'

# Labeled crawl JSON files; no cell visible in this notebook reads them.
json_representation_http_dir = f'{base_directory}crawl-http-labeled.json'
json_representation_js_dir = f'{base_directory}crawl-js-connected-labeled.json'

In [None]:
# Load the feature table and the label table (read in that order).
# Later cells pair the two frames row by row by position, so both files are
# expected to list the same samples in the same order.
dataset, ground_truth = (
    pd.read_csv(csv_path) for csv_path in (final_data_set, final_targets)
)

In [None]:
# Split `dataset` into per-sample chain ids, redirect ids and feature vectors,
# and pick up the label of each sample from `ground_truth`.
#
# Column layout relied on here (by position, not by name):
#   dataset      column 1  -> identifier string '<chain id>|<redirect id>'
#   dataset      column 2+ -> features
#   ground_truth column 1  -> label
# Column 0 of both frames is skipped (presumably a saved index column —
# TODO confirm against the code that wrote the CSVs).

# Rows are paired purely by position, so a length mismatch would either raise
# halfway through or silently attach labels to the wrong samples.
if len(dataset) != len(ground_truth):
    raise ValueError(
        'dataset has %d rows but ground_truth has %d rows; '
        'they must describe the same samples in the same order'
        % (len(dataset), len(ground_truth))
    )

chain_ids = []
redirect_ids = []

# Iterating the tqdm object (instead of calling update() by hand) closes the
# progress bar once the loop finishes.
pbar = tqdm(dataset.iloc[:, 1], total=len(dataset), position=0, leave=True)

for identifier in pbar:
    # The redirect id is whatever follows the last '|'; everything before it
    # (inner '|' characters included) is the chain id.
    identifier_parts = identifier.rpartition('|')
    chain_ids.append(identifier_parts[0])
    redirect_ids.append(identifier_parts[2])

# Slice whole columns once instead of building a Series per row. `.iloc` with
# explicit positions also avoids the deprecated integer `[]` lookup on a
# label-indexed row. If the feature columns are all numeric, the rows keep a
# numeric dtype instead of becoming object arrays.
features = list(dataset.iloc[:, 2:].to_numpy())
targets = ground_truth.iloc[:, 1].tolist()

In [None]:
def cross_validation(clf, x, y, folds=10):
    """Collect one out-of-fold prediction per sample using stratified k-fold CV.

    Parameters
    ----------
    clf : object
        Estimator exposing ``fit(x, y)`` and ``predict(x)``. The same object
        is fitted again for every fold; no copy is made, so after the call it
        holds whatever state the last ``fit`` left behind.
    x : array-like
        Samples, indexed along the first axis. Converted with ``np.asarray``
        so plain lists are accepted as well as NumPy arrays.
    y : array-like
        Labels aligned with ``x``; also used to stratify the folds.
    folds : int, default 10
        Number of splits passed to ``StratifiedKFold``.

    Returns
    -------
    list
        ``len(x)`` entries in the original sample order. Entry ``i`` is the
        prediction made for sample ``i`` by the fit whose test split
        contained it.
    """
    # Index arrays produced by the splitter can only index NumPy arrays, so
    # coerce list inputs up front.
    x = np.asarray(x)
    y = np.asarray(y)

    all_predictions = [None] * len(x)
    # Fixed seed: the shuffled fold assignment is the same on every run.
    skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=42)

    for fold_number, (train, test) in enumerate(skf.split(x, y), start=1):
        print('Iter', fold_number)
        clf.fit(x[train], y[train])
        # Scatter this fold's predictions back to their original positions.
        for sample_index, prediction in zip(test, clf.predict(x[test])):
            all_predictions[sample_index] = prediction

    return all_predictions

In [None]:
# Random forest with a fixed seed; n_jobs=7 asks for seven parallel workers.
clf = RandomForestClassifier(
    random_state=3,
    n_jobs=7,
    n_estimators=100,
)

# One out-of-fold prediction per sample, in the same order as `features`
# (cross_validation's default of 10 folds is used).
all_predictions = cross_validation(
    clf,
    np.array(features),
    np.array(targets),
)

In [None]:
# Reduce the per-sample predictions to the set of identifiers to report.
#
# * Samples predicted 0 are ignored.
# * Chains whose id starts with 'H|' (presumably HTTP redirect chains —
#   TODO confirm) contribute only their lowest-numbered flagged redirect,
#   and redirect '0' is never reported for them.
# * Every other flagged sample is reported as is.
results_dict = {}        # 'H|' chain id -> lowest flagged redirect id (kept as str)
results_earliest = set()

for position, predicted_label in enumerate(all_predictions):
    if predicted_label == 0:
        continue

    chain = chain_ids[position]
    redirect = redirect_ids[position]

    if not chain.startswith('H|'):
        results_earliest.add(chain + '|' + redirect)
        continue

    if redirect == '0':
        continue

    best_so_far = results_dict.get(chain)
    # Redirect ids are strings, so compare them numerically.
    if best_so_far is None or int(redirect) < int(best_so_far):
        results_dict[chain] = redirect

# Re-attach the winning redirect id to each 'H|' chain.
results_earliest.update(
    chain + '|' + redirect for chain, redirect in results_dict.items()
)

In [None]:
# Write the detections in sorted order. `results_earliest` is a set of
# strings, whose iteration order can change from one interpreter run to the
# next (string hashing is randomised), so writing it directly would produce a
# differently ordered file on each run.
# NOTE(review): assumes utilities.write_list_simple accepts any iterable of
# strings, as its name suggests — confirm against the utilities module.
utilities.write_list_simple(predictions_dir, sorted(results_earliest))