# Run for results
Once you've run the other scripts to generate the necessary CSV files, adjust the file names below to point to them and run the notebook.

In [None]:
import pandas as pd
import numpy as np
from os import walk

# Directory prefixes, relative to the notebook. Each one ends with '/' so a
# file name can be appended by plain string concatenation.
cache_dir = '../data/cache/'
data_dir = '../data/QuestionResponseTimes/'

In [None]:
# Load the posts table from the cache directory and preview three rows.
posts_path = cache_dir + 'posts.csv'
posts = pd.read_csv(posts_path)
posts.head(n=3)

In [None]:
# Load the per-tag statistics export and preview three rows.
tag_counts = pd.read_csv(filepath_or_buffer='../data/TagStatistics_2017.csv')
tag_counts.head(n=3)

In [None]:
# Load the active-subscriber counts, then convert the index labels to strings
# and rename the generic 'numUsers' column to 'numActiveSubs'.
active_subs = pd.read_csv('../data/ActiveSubscribers_2017.csv')
active_subs = active_subs.rename(index=str, columns={'numUsers': 'numActiveSubs'})
active_subs.head(n=3)

In [None]:
# Load the responsive-subscriber counts, then convert the index labels to
# strings and rename the generic 'numUsers' column to 'numResponsiveSubs'.
responsive_subs = pd.read_csv('../data/ResponsiveSubscribers_2017.csv')
responsive_subs = responsive_subs.rename(index=str, columns={'numUsers': 'numResponsiveSubs'})
responsive_subs.head(n=3)

## Join all the tag stats

In [None]:
# Inner-join the three per-tag tables. tag_counts is keyed by 'Tag' and the two
# subscriber tables by 'TagName', so only tags present in all three are kept.
tag_stats_df = pd.merge(tag_counts, responsive_subs, how='inner', left_on='Tag', right_on='TagName')
tag_stats_df = pd.merge(tag_stats_df, active_subs, how='inner', on='TagName')
# After the join 'TagName' carries the same values as 'Tag'; drop it.
tag_stats_df = tag_stats_df.drop('TagName', axis=1)
tag_stats_df.head(n=3)

## Make ratios

In [None]:
# Each tag's share of the active-subscriber count summed over all tags.
active_total = tag_stats_df['numActiveSubs'].sum()
tag_stats_df['ActiveSubRatio'] = tag_stats_df['numActiveSubs'].div(active_total)

In [None]:
# Each tag's share of the responsive-subscriber count summed over all tags.
responsive_total = tag_stats_df['numResponsiveSubs'].sum()
tag_stats_df['ResponsiveSubRatio'] = tag_stats_df['numResponsiveSubs'].div(responsive_total)

In [None]:
# Each tag's share of the question count summed over all tags.
question_total = tag_stats_df['Questions'].sum()
tag_stats_df['PopularityRating'] = tag_stats_df['Questions'].div(question_total)

In [None]:
# Preview the joined table with the three ratio columns added.
tag_stats_df.head(n=3)

In [None]:
# Nested lookup of the form {tag: {column: value}} built from the joined table.
tag_rows = tag_stats_df.set_index('Tag')
tag_dict = tag_rows.to_dict(orient='index')

## Get stats per question

In [None]:
# Keep only the tags that have strictly more active subscribers than the tag
# stored at row position TOP_TAG_CUTOFF_POSITION of the subscriber table.
# NOTE(review): this selects the "top" tags only if the CSV is sorted by
# subscriber count in descending order - confirm against the export query.
TOP_TAG_CUTOFF_POSITION = 6500

active_subs['numActiveSubs'] = active_subs['numActiveSubs'].astype(int)
if len(active_subs) <= TOP_TAG_CUTOFF_POSITION:
    raise ValueError(
        'active_subs has only %d rows; need more than %d to pick the cutoff '
        '(was this cell already run on the filtered table?)'
        % (len(active_subs), TOP_TAG_CUTOFF_POSITION)
    )

# The index labels of active_subs are strings (rename(index=str) at load time),
# so the lookup is positional. .iloc states that explicitly instead of relying
# on the deprecated integer-key fallback of Series.__getitem__.
cutoff = active_subs['numActiveSubs'].iloc[TOP_TAG_CUTOFF_POSITION]
active_subs = active_subs[active_subs['numActiveSubs'] > cutoff]
top_tags = active_subs['TagName']
# top_tags

In [None]:
# Reduce to the distinct tag names, kept in order of first appearance, as an array.
top_tags = top_tags.drop_duplicates().values

In [None]:
# Get tag stats per question: for every post that carries at least one top
# tag, average the numeric per-tag statistics over all of the post's tags and
# attach the post id and its response time.
top_tag_set = set(top_tags)  # built once so the per-post membership test is cheap
data_df = []
for index, post in posts.iterrows():
    # Presumably 'tags' is a stringified list such as "['a', 'b']": drop the
    # outer brackets, split on commas, then strip whitespace and the quotes.
    tags = [tag.strip()[1:-1] for tag in post['tags'][1:-1].split(',')]
    if top_tag_set.isdisjoint(tags):
        continue
    # numeric_only=True skips the string 'Tag' column; without it pandas >= 2.0
    # raises instead of silently ignoring non-numeric columns.
    row_stats = tag_stats_df[tag_stats_df['Tag'].isin(tags)].mean(numeric_only=True)
    row_stats['post_id'] = str(post['post_id'])
    row_stats['response_time'] = post['response_time']
    data_df.append(row_stats)

In [None]:
# Stack the collected per-post Series into one table (one row per post).
data_df = pd.DataFrame(data=data_df)
data_df.head(n=3)

In [None]:
# Save the per-post table so later runs can load it instead of rebuilding it.
# The path is built from cache_dir so the file lands in the same cache
# directory that posts.csv is read from.
data_df.to_csv(cache_dir + 'data_df_trimmed.csv', index=False)

## Split into X and y

In [None]:
# Feature columns used by the model.
# Candidates currently left out: Questions, Views, percentOfSite, Score,
# Answers, AvgViews, AvgScore, AvgAnswers.
ml_columns = ['ActiveSubRatio', 'ResponsiveSubRatio', 'PopularityRating']

In [None]:
# Feature matrix: missing values and +/- infinity are all replaced by 0.
features = data_df[ml_columns].fillna(0)
X = features.replace([np.inf, -np.inf], 0)

In [None]:
# Target: the response time recorded for each post.
y = data_df.loc[:, 'response_time']

In [None]:
# Bin the y values
from scipy.stats import binned_statistic
# Show the 26 edges that delimit 25 equal-width bins over the response times.
binned_statistic(x=y, values=y, statistic='mean', bins=25).bin_edges

In [None]:
# Assign every response time to one of the 25 intervals. pd.cut intervals are
# open on the left by default, so a value equal to the first edge gets NaN.
response_edges = binned_statistic(y, y, bins=25).bin_edges
y_bins = pd.cut(y, bins=response_edges)

In [None]:
# Two-way mapping between response-time intervals and their integer bin index.
# The categories are read directly from the cut rather than from .unique(), so
# every one of the 25 intervals gets an index even if no response time falls
# into it (whether .unique() keeps unused categories depends on the pandas
# version). Index i therefore always means "i-th interval from the left".
intervals = pd.cut(y, binned_statistic(y, y, bins=25).bin_edges).cat.categories
bins = {
    interval: i
    for i, interval in enumerate(sorted(intervals, key=lambda iv: iv.left))
}
bins_inverse = {i: interval for interval, i in bins.items()}

In [None]:
# Translate each interval into its integer bin index. Values that received no
# interval (NaN in y_bins, e.g. a value sitting on the open left edge of the
# first interval) are placed in bin 0 by fillna(0).
# apply() on a categorical Series can return another categorical Series, which
# does not support arithmetic; the final astype(int) yields a plain integer
# Series so expressions such as y_binned - y_pred work.
y_binned = y_bins.apply(lambda interval: bins[interval]).fillna(0).astype(int)

## Normalize X

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Fit a min-max scaler on X and rescale every feature column with it.
minmax = MinMaxScaler()
minmax.fit(X)
X_norm = minmax.transform(X)

## Train and test model

In [None]:
# Import models
from sklearn.neighbors import KNeighborsClassifier

In [None]:
from sklearn.model_selection import cross_val_score, cross_val_predict
# 10-fold cross-validation of a 25-nearest-neighbour classifier on the binned
# response times: per-fold scores first, then out-of-fold predictions.
knn_settings = {'n_neighbors': 25}
cv_inputs = {'X': X_norm, 'y': y_binned, 'cv': 10}

clf = KNeighborsClassifier(**knn_settings)
scores = cross_val_score(clf, **cv_inputs)

clf = KNeighborsClassifier(**knn_settings)
y_pred = cross_val_predict(clf, **cv_inputs)

In [None]:
# Average of the per-fold cross-validation scores.
np.mean(scores)

## Calculate the baseline

In [None]:
# Mean response time over all posts in y.
y_mean = y.mean(axis=0)
y_mean

In [None]:
# guess the mean or median for every guess
from sklearn.metrics import accuracy_score
# Accuracy obtained by predicting the same bin for every post.
# NOTE(review): bin 6 is hard-coded; presumably it is the bin holding the mean
# or median response time of the original data set - confirm before reuse.
constant_guess = np.full(len(y_binned), 6)
accuracy_score(y_binned, constant_guess)

In [None]:
# Guess randomly: one uniform draw from the bin indices 0..24 for every post.
random_guess = np.random.randint(low=0, high=25, size=len(y_binned))
accuracy_score(y_binned, random_guess)

## Calculate time bin difference

In [None]:
# Absolute distance, counted in bins, between the true bin and the kNN
# prediction, and between the true bin and the constant guess of bin 6.
diffs = (y_binned - y_pred).abs()
baseline_diff = (y_binned - np.full(len(y_binned), 6)).abs()

In [None]:
import matplotlib.pyplot as plt
# Side-by-side histogram of the bin distance for the kNN model and the
# constant baseline. With 25 bins the distance can be any of 0..24, so 26
# edges (0..25) are needed for one bar per distance; range(0, 25) would merge
# distances 23 and 24 into the closed last bin.
plt.hist([diffs, baseline_diff], bins=range(0, 26))
plt.legend(['kNN', 'Mean Baseline'])
plt.xlabel('Time bin difference')
plt.ylabel('Frequency')
# Save before show(): the figure is written to disk first, then displayed.
plt.savefig('../graphs/time-bin-difference.png')
plt.show()

## Calculate the relative error

In [None]:
# Turn every predicted bin index back into a time: the midpoint of its interval.
predicted_time = np.array([
    (bins_inverse[pred].left + bins_inverse[pred].right) / 2
    for pred in y_pred
])

In [None]:
# Relative error per post: the absolute difference between the actual and the
# predicted time, divided by the smaller of the two (or by 1 when that smaller
# value is not positive, which avoids dividing by zero).
error = []
for y_actual, pred in zip(y, predicted_time):
    smaller = min([y_actual, pred])
    denominator = smaller if smaller > 0 else 1
    error.append(abs(y_actual - pred) / denominator)

In [None]:
import matplotlib.pyplot as plt
# Histogram of the relative error with unit-wide bins starting at 0.
# NOTE(review): the bin edges stop below half of the largest error, so larger
# errors are not drawn - presumably to trim the long tail; confirm intent.
upper_edge = int(max(error) / 2)
plt.hist(error, bins=range(0, upper_edge))
plt.xlabel('Relative Error')
plt.ylabel('Frequency')
plt.savefig('../graphs/relative-error.png')
plt.show()

In [None]:
# Convert the collected errors to a NumPy array for the summary statistics.
error = np.asarray(error)

In [None]:
# Mean relative error over all posts.
np.mean(error)

In [None]:
# Median relative error over all posts.
np.median(a=error)