In [31]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['figure.figsize'] = (11.7, 11.3)
plt.rcParams.update({'font.size': 20})
import seaborn as sns
sns.set(rc={'figure.figsize':(11.7,8.3)})
from datetime import datetime
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import time

In [2]:
Questions = pd.read_csv('FEQuestions.csv')

In [3]:
Questions.head(2)

Unnamed: 0,QID,QuestionUserId,QuestionCreateDate,QuestionScore,QuestionTitle,QuestionBody,NumAnswers,QuestionTitleAndBody,CodeText,TagFreeNonCodeText,...,HasHyperlink,HasEmbedImage,HasTried,Tags,NumTags,HasTagsInTop49,QuestionLength,QuestionCodeLength,QuestionTitleLength,QuestionPercentCode
0,469,147.0,2008-08-02T15:11:16Z,21,How can I find the full path to a font from it...,<p>I am using the Photoshop's javascript API t...,4.0,How can I find the full path to a font from it...,,How can I find the full path to a font from it...,...,False,False,False,"['python', 'osx', 'fonts', 'photoshop']",4,True,721,2,17,0.002766
1,502,147.0,2008-08-02T17:01:58Z,27,Get a preview JPEG of a PDF on Windows?,<p>I have a cross-platform (Python) applicatio...,3.0,Get a preview JPEG of a PDF on Windows? <p>I h...,,Get a preview JPEG of a PDF on Windows? I have...,...,True,False,False,"['python', 'windows', 'image', 'pdf']",4,True,367,2,9,0.00542


In [4]:
print(Questions.isnull().sum())

QID                                  0
QuestionUserId                    6212
QuestionCreateDate                   0
QuestionScore                        0
QuestionTitle                        0
QuestionBody                         0
NumAnswers                           0
QuestionTitleAndBody                 0
CodeText                        408555
TagFreeNonCodeText                   0
CodeTextLemmatized                   0
TagFreeNonCodeTextLemmatized         0
HasError                             0
HasTraceback                         0
HasMultiLineCode                     0
HasBlockCode                         0
HasHyperlink                         0
HasEmbedImage                        0
HasTried                             0
Tags                                 0
NumTags                              0
HasTagsInTop49                       0
QuestionLength                       0
QuestionCodeLength                   0
QuestionTitleLength                  0
QuestionPercentCode      

In [5]:
Questions['QuestionUserId'].fillna(0.0, inplace=True)
Questions['QuestionCreateDate'] = pd.to_datetime(Questions['QuestionCreateDate'])

In [6]:
Y = Questions.set_index('QID')[['NumAnswers']]

In [7]:
Y.head()

Unnamed: 0_level_0,NumAnswers
QID,Unnamed: 1_level_1
469,4.0
502,3.0
535,7.0
594,3.0
683,8.0


In [30]:
Y.mode()

Unnamed: 0,NumAnswers
0,1.0


In [33]:
Y.mean()

NumAnswers    1.603453
dtype: float64

In [34]:
Y.median()

NumAnswers    1.0
dtype: float64

In [32]:
r2_score(Y.NumAnswers, np.repeat(1, Y.shape[0]))

-0.1915525288536437

In [35]:
r2_score(Y.NumAnswers, np.repeat(1.603453, Y.shape[0]))

-3.019806626980426e-14

This is the R^2 to beat.

In [8]:
X = Questions.set_index('QID').drop(['Tags', 'QuestionTitle', 'QuestionBody', 'QuestionScore', 'NumAnswers', 'QuestionTitleAndBody', 'CodeText', 'TagFreeNonCodeText', 'CodeTextLemmatized', 'TagFreeNonCodeTextLemmatized'], axis=1, inplace=False)

In [9]:
X.head()

Unnamed: 0_level_0,QuestionUserId,QuestionCreateDate,HasError,HasTraceback,HasMultiLineCode,HasBlockCode,HasHyperlink,HasEmbedImage,HasTried,NumTags,HasTagsInTop49,QuestionLength,QuestionCodeLength,QuestionTitleLength,QuestionPercentCode
QID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
469,147.0,2008-08-02 15:11:16,False,False,False,False,False,False,False,4,True,721,2,17,0.002766
502,147.0,2008-08-02 17:01:58,False,False,False,False,True,False,False,4,True,367,2,9,0.00542
535,154.0,2008-08-02 18:43:54,False,False,False,False,False,False,False,3,False,963,2,7,0.002073
594,116.0,2008-08-03 01:15:08,False,False,False,False,False,False,False,5,False,201,2,9,0.009852
683,199.0,2008-08-03 13:19:16,False,False,False,True,False,False,False,3,True,422,78,12,0.156


In [10]:
X.columns

Index(['QuestionUserId', 'QuestionCreateDate', 'HasError', 'HasTraceback',
       'HasMultiLineCode', 'HasBlockCode', 'HasHyperlink', 'HasEmbedImage',
       'HasTried', 'NumTags', 'HasTagsInTop49', 'QuestionLength',
       'QuestionCodeLength', 'QuestionTitleLength', 'QuestionPercentCode'],
      dtype='object')

Create 1-hot encoding for cyclic variables.

In [11]:
X['QuestionCreateDay'] = X['QuestionCreateDate'].dt.day

In [12]:
X['QuestionCreateMonth'] = X['QuestionCreateDate'].dt.month

In [13]:
X['QuestionCreateYear'] = X['QuestionCreateDate'].dt.year

In [14]:
X['QuestionCreateDayOfWeek'] = X['QuestionCreateDate'].dt.dayofweek

In [15]:
X['QuestionCreateHourOfDay'] = X['QuestionCreateDate'].dt.hour

In [16]:
X.columns

Index(['QuestionUserId', 'QuestionCreateDate', 'HasError', 'HasTraceback',
       'HasMultiLineCode', 'HasBlockCode', 'HasHyperlink', 'HasEmbedImage',
       'HasTried', 'NumTags', 'HasTagsInTop49', 'QuestionLength',
       'QuestionCodeLength', 'QuestionTitleLength', 'QuestionPercentCode',
       'QuestionCreateDay', 'QuestionCreateMonth', 'QuestionCreateYear',
       'QuestionCreateDayOfWeek', 'QuestionCreateHourOfDay'],
      dtype='object')

In [17]:
X_dummies = pd.get_dummies(X, columns=['QuestionCreateMonth', 'QuestionCreateDay', 'QuestionCreateDayOfWeek', 'QuestionCreateHourOfDay']).drop('QuestionCreateDate', axis=1)

In [18]:
X_dummies.head()

Unnamed: 0_level_0,QuestionUserId,HasError,HasTraceback,HasMultiLineCode,HasBlockCode,HasHyperlink,HasEmbedImage,HasTried,NumTags,HasTagsInTop49,...,QuestionCreateHourOfDay_14,QuestionCreateHourOfDay_15,QuestionCreateHourOfDay_16,QuestionCreateHourOfDay_17,QuestionCreateHourOfDay_18,QuestionCreateHourOfDay_19,QuestionCreateHourOfDay_20,QuestionCreateHourOfDay_21,QuestionCreateHourOfDay_22,QuestionCreateHourOfDay_23
QID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
469,147.0,False,False,False,False,False,False,False,4,True,...,0,1,0,0,0,0,0,0,0,0
502,147.0,False,False,False,False,True,False,False,4,True,...,0,0,0,1,0,0,0,0,0,0
535,154.0,False,False,False,False,False,False,False,3,False,...,0,0,0,0,1,0,0,0,0,0
594,116.0,False,False,False,False,False,False,False,5,False,...,0,0,0,0,0,0,0,0,0,0
683,199.0,False,False,False,True,False,False,False,3,True,...,0,0,0,0,0,0,0,0,0,0


In [19]:
def adj_r2(r2, X):
    n = X.shape[0]
    p = X.shape[1]
    return 1 - (1 - r2) * ((n-1) / (n-p-1))

In [20]:
model = LinearRegression()
model.fit(X_dummies, Y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [21]:
training_accuracy_dummies = model.score(X_dummies, Y)
print("Training R^2: ", training_accuracy_dummies)
print("Training Adjusted R^2: ", adj_r2(training_accuracy_dummies, X_dummies))

Training R^2:  0.13856718073266983
Training Adjusted R^2:  0.1384409150359629


In [22]:
pd.options.display.max_rows = 999

In [23]:
coeffs = pd.DataFrame(data=[X_dummies.columns, model.coef_.squeeze()]).T
coeffs.columns = ['Feature', 'Coeff']
coeffs.sort_values('Coeff', ascending=False)

Unnamed: 0,Feature,Coeff
29,QuestionCreateDay_3,42128.3
43,QuestionCreateDay_17,42128.3
27,QuestionCreateDay_1,42128.3
38,QuestionCreateDay_12,42128.3
36,QuestionCreateDay_10,42128.3
39,QuestionCreateDay_13,42128.3
41,QuestionCreateDay_15,42128.3
37,QuestionCreateDay_11,42128.3
34,QuestionCreateDay_8,42128.3
49,QuestionCreateDay_23,42128.3


Based on these coefficients, none of the time features make much of an impact.

In [24]:
X_nodummy = Questions.set_index('QID').drop(['Tags', 'QuestionTitle', 'QuestionBody', 'QuestionScore', 'NumAnswers', 'QuestionTitleAndBody', 'CodeText', 'TagFreeNonCodeText', 'CodeTextLemmatized', 'TagFreeNonCodeTextLemmatized'], axis=1, inplace=False)

In [25]:
X_nodummy['QuestionCreateDate'] = X_nodummy['QuestionCreateDate'].apply(lambda x: (x-datetime(1970,1,1)).total_seconds())

In [26]:
model_nodummy = LinearRegression()
model_nodummy.fit(X_nodummy, Y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [27]:
training_accuracy_nodummies = model_nodummy.score(X_nodummy, Y)
print("Training R^2: ", training_accuracy_nodummies)
print("Training Adjusted R^2: ", adj_r2(training_accuracy_nodummies, X_nodummy))

Training R^2:  0.138072905748014
Training Adjusted R^2:  0.13805161539681077


In [28]:
coeffs = pd.DataFrame(data=[X_nodummy.columns, model_nodummy.coef_.squeeze()]).T
coeffs.columns = ['Feature', 'Coeff']
coeffs.sort_values('Coeff', ascending=False)

Unnamed: 0,Feature,Coeff
14,QuestionPercentCode,2.90633
4,HasMultiLineCode,0.121801
10,HasTagsInTop49,0.0866519
5,HasBlockCode,0.0740014
13,QuestionTitleLength,0.011008
8,HasTried,0.00245624
0,QuestionUserId,5.90461e-08
1,QuestionCreateDate,-9.31087e-09
11,QuestionLength,-5.8752e-05
12,QuestionCodeLength,-0.00157708
