# Run for results
Once you've run the other scripts to generate the necessary CSV files, adjust the file names below to point to them and run the notebook.

In [None]:
import pandas as pd
import numpy as np
from os import walk

# NOTE(review): data_dir is not referenced by the other cells visible in this
# notebook; presumably it points at the raw question response time data.
data_dir = '../data/QuestionResponseTimes/'
# Directory of cached intermediate CSVs; posts.csv is read from here.
cache_dir = '../data/cache/'

In [None]:
# Load the cached posts table. Later cells read its 'tags', 'post_id' and
# 'response_time' columns.
posts = pd.read_csv(cache_dir+'posts.csv')
# Preview the first three rows.
posts.head(3)

In [None]:
# Load the per-tag statistics. Later cells use its 'Tag' and 'Questions' columns.
tag_counts = pd.read_csv('../data/TagStatistics_2017.csv')
# Preview the first three rows.
tag_counts.head(3)

In [None]:
# Load the per-tag subscriber counts and rename 'numUsers' to 'numActiveSubs' so
# it stays distinct from the other subscriber table's 'numUsers' after merging.
# index=str additionally converts the row labels to strings.
active_subs = pd.read_csv('../data/ActiveSubscribers_2017.csv').rename(index=str,columns={'numUsers':'numActiveSubs'})
# Preview the first three rows.
active_subs.head(3)

In [None]:
# Load the per-tag subscriber counts and rename 'numUsers' to 'numResponsiveSubs'
# so it stays distinct from the other subscriber table's 'numUsers' after merging.
# index=str additionally converts the row labels to strings.
responsive_subs = pd.read_csv('../data/ResponsiveSubscribers_2017.csv').rename(index=str,columns={'numUsers':'numResponsiveSubs'})
# Preview the first three rows.
responsive_subs.head(3)

## Join all the tag stats

In [None]:
# Combine the three per-tag tables into one. Both joins are inner joins, so only
# tags present in all three tables survive. 'TagName' duplicates 'Tag' once the
# tables are joined, so it is dropped.
tag_stats_df = (
    tag_counts
    .merge(responsive_subs, left_on='Tag', right_on='TagName', how='inner')
    .merge(active_subs, on='TagName', how='inner')
    .drop('TagName', axis=1)
)
tag_stats_df.head(3)

## Make ratios

In [None]:
# Each tag's share of the active-subscriber count summed over all tags.
tag_stats_df['ActiveSubRatio'] = tag_stats_df['numActiveSubs'].div(tag_stats_df['numActiveSubs'].sum())

In [None]:
# Each tag's share of the responsive-subscriber count summed over all tags.
tag_stats_df['ResponsiveSubRatio'] = tag_stats_df['numResponsiveSubs'].div(tag_stats_df['numResponsiveSubs'].sum())

In [None]:
# Each tag's share of the question count summed over all tags.
tag_stats_df['PopularityRating'] = tag_stats_df['Questions'].div(tag_stats_df['Questions'].sum())

In [None]:
# Preview the first three rows of the per-tag table.
tag_stats_df.head(3)

In [None]:
# Nested lookup built from tag_stats_df: {tag: {column name: value}}.
# NOTE(review): tag_dict is not referenced by the other cells visible here.
tag_dict = tag_stats_df.set_index('Tag').to_dict(orient='index')

In [None]:
# Get tag stats per question
# For every post, average the numeric statistics of the tags it carries and
# attach the post's id (as a string) and its response time. The result is a
# list with one Series per post; a post with no matching tag gets NaN statistics.
data_df = []
for _, post in posts.iterrows():
    # Drop the first and last character of the whole field, split on commas,
    # then drop the first and last character of each trimmed item
    # (presumably the brackets and quotes of a stringified list - confirm).
    tags = [tag.strip()[1:-1] for tag in post['tags'][1:-1].split(',')]
    # numeric_only=True keeps the string 'Tag' column out of the average;
    # without it pandas >= 2.0 raises TypeError instead of skipping the column.
    d = tag_stats_df[tag_stats_df['Tag'].isin(tags)].mean(numeric_only=True)
    d['post_id'] = str(post['post_id'])
    d['response_time'] = post['response_time']
    data_df.append(d)

In [None]:
# Turn the list of per-post Series into a table with one row per post.
data_df = pd.DataFrame(data_df)
# Preview the first three rows.
data_df.head(3)

In [None]:
# Keep each post's raw tag string next to its averaged tag statistics.
# NOTE(review): the assignment aligns on row labels; assumes data_df and posts
# share the same row labels - confirm.
data_df['Tag'] = posts['tags']
# Preview the first three rows.
data_df.head(3)

In [None]:
# Save so later can just load
# Build the path from cache_dir so the file follows that setting instead of a
# hard-coded copy of it; the row index is not written.
data_df.to_csv(cache_dir+'data_df.csv', index=False)

## Split into X and y

In [None]:
# Columns of data_df used as model features.
ml_columns = [
    'ActiveSubRatio',
    'ResponsiveSubRatio',
    'PopularityRating'
]

In [None]:
# Feature matrix: the selected columns with NaN and +/-inf entries replaced by 0.
X = data_df[ml_columns].fillna(0).replace([np.inf, -np.inf], 0)

In [None]:
# Target: the response_time value of each post.
y = data_df['response_time']

In [None]:
# bin the y values
from scipy.stats import binned_statistic
# Show the edges of the 25 equal-width bins that binned_statistic lays over y.
binned_statistic(y, y, bins=25).bin_edges

In [None]:
# Assign every response time to the interval between two consecutive bin edges.
# NOTE(review): pd.cut's default intervals are open on the left, so values equal
# to the lowest edge (the minimum of y) presumably come out as NaN - confirm.
y_bins = pd.cut(y, binned_statistic(y, y, bins=25).bin_edges)

In [None]:
# Map each response-time interval to its ordinal position, ordered by the
# interval's left edge (0 = lowest interval).
# The intervals come from y_bins rather than from a second binned_statistic /
# pd.cut run, so the keys always match the categories looked up from y_bins.
bins = {
    interval: position
    for position, interval in enumerate(
        sorted(y_bins.unique().categories, key=lambda interval: interval.left)
    )
}

In [None]:
# Replace each interval by its integer label from `bins`; entries that are still
# NaN afterwards are given label 0.
y_binned = y_bins.apply(lambda x: bins[x]).fillna(0)

## Train and test model

In [None]:
# Import models
from sklearn.neighbors import KNeighborsClassifier

In [None]:
from sklearn.model_selection import cross_val_score
# k-nearest-neighbours classifier that votes among the 10 closest samples.
clf = KNeighborsClassifier(n_neighbors=10)
# 10-fold cross-validation of the classifier on the features and binned
# response times; `scores` holds one score per fold.
scores = cross_val_score(clf, X, y_binned, cv=10)

In [None]:
# Average of the per-fold cross-validation scores.
scores.mean()