In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['figure.figsize'] = (11.7, 11.3)
plt.rcParams.update({'font.size': 20})
import seaborn as sns
sns.set(rc={'figure.figsize':(11.7,8.3)})
from datetime import datetime
from sklearn.linear_model import LinearRegression
import time

In [2]:
Questions = pd.read_csv('FEQuestions.csv')

In [3]:
Questions.head(2)

Unnamed: 0,QID,QuestionUserId,QuestionCreateDate,QuestionScore,QuestionTitle,QuestionBody,NumAnswers,QuestionTitleAndBody,CodeText,TagFreeNonCodeText,...,HasHyperlink,HasEmbedImage,HasTried,Tags,NumTags,HasTagsInTop49,QuestionLength,QuestionCodeLength,QuestionTitleLength,QuestionPercentCode
0,469,147.0,2008-08-02T15:11:16Z,21,How can I find the full path to a font from it...,<p>I am using the Photoshop's javascript API t...,4.0,How can I find the full path to a font from it...,,How can I find the full path to a font from it...,...,False,False,False,"['python', 'osx', 'fonts', 'photoshop']",4,True,721,2,17,0.002766
1,502,147.0,2008-08-02T17:01:58Z,27,Get a preview JPEG of a PDF on Windows?,<p>I have a cross-platform (Python) applicatio...,3.0,Get a preview JPEG of a PDF on Windows? <p>I h...,,Get a preview JPEG of a PDF on Windows? I have...,...,False,False,False,"['python', 'windows', 'image', 'pdf']",4,True,367,2,9,0.00542


In [4]:
print(Questions.isnull().sum())

QID                                  0
QuestionUserId                    6212
QuestionCreateDate                   0
QuestionScore                        0
QuestionTitle                        0
QuestionBody                         0
NumAnswers                           0
QuestionTitleAndBody                 0
CodeText                        408555
TagFreeNonCodeText                   0
CodeTextLemmatized                   0
TagFreeNonCodeTextLemmatized         0
HasError                             0
HasTraceback                         0
HasMultiLineCode                     0
HasBlockCode                         0
HasHyperlink                         0
HasEmbedImage                        0
HasTried                             0
Tags                                 0
NumTags                              0
HasTagsInTop49                       0
QuestionLength                       0
QuestionCodeLength                   0
QuestionTitleLength                  0
QuestionPercentCode      

In [5]:
Questions['QuestionUserId'].fillna(0.0, inplace=True)
Questions['QuestionCreateDate'] = pd.to_datetime(Questions['QuestionCreateDate'])

In [6]:
Y = Questions.set_index('QID')[['NumAnswers']]

In [7]:
Y.head()

Unnamed: 0_level_0,NumAnswers
QID,Unnamed: 1_level_1
469,4.0
502,3.0
535,7.0
594,3.0
683,8.0


In [8]:
X = Questions.set_index('QID').drop(['Tags', 'QuestionTitle', 'QuestionBody', 'QuestionScore', 'NumAnswers', 'QuestionTitleAndBody', 'CodeText', 'TagFreeNonCodeText', 'CodeTextLemmatized', 'TagFreeNonCodeTextLemmatized'], axis=1, inplace=False)

In [9]:
X.head()

Unnamed: 0_level_0,QuestionUserId,QuestionCreateDate,HasError,HasTraceback,HasMultiLineCode,HasBlockCode,HasHyperlink,HasEmbedImage,HasTried,NumTags,HasTagsInTop49,QuestionLength,QuestionCodeLength,QuestionTitleLength,QuestionPercentCode
QID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
469,147.0,2008-08-02 15:11:16,False,False,False,False,False,False,False,4,True,721,2,17,0.002766
502,147.0,2008-08-02 17:01:58,False,False,False,False,False,False,False,4,True,367,2,9,0.00542
535,154.0,2008-08-02 18:43:54,False,False,False,False,False,False,False,3,False,963,2,7,0.002073
594,116.0,2008-08-03 01:15:08,False,False,False,False,False,False,False,5,False,201,2,9,0.009852
683,199.0,2008-08-03 13:19:16,False,False,False,True,False,False,False,3,True,422,78,12,0.156


In [10]:
X.columns

Index(['QuestionUserId', 'QuestionCreateDate', 'HasError', 'HasTraceback',
       'HasMultiLineCode', 'HasBlockCode', 'HasHyperlink', 'HasEmbedImage',
       'HasTried', 'NumTags', 'HasTagsInTop49', 'QuestionLength',
       'QuestionCodeLength', 'QuestionTitleLength', 'QuestionPercentCode'],
      dtype='object')

Create 1-hot encoding for cyclic variables.

In [11]:
X['QuestionCreateDay'] = X['QuestionCreateDate'].dt.day

In [12]:
X['QuestionCreateMonth'] = X['QuestionCreateDate'].dt.month

In [13]:
X['QuestionCreateYear'] = X['QuestionCreateDate'].dt.year

In [14]:
X['QuestionCreateDayOfWeek'] = X['QuestionCreateDate'].dt.dayofweek

In [15]:
X['QuestionCreateHourOfDay'] = X['QuestionCreateDate'].dt.hour

In [16]:
X.columns

Index(['QuestionUserId', 'QuestionCreateDate', 'HasError', 'HasTraceback',
       'HasMultiLineCode', 'HasBlockCode', 'HasHyperlink', 'HasEmbedImage',
       'HasTried', 'NumTags', 'HasTagsInTop49', 'QuestionLength',
       'QuestionCodeLength', 'QuestionTitleLength', 'QuestionPercentCode',
       'QuestionCreateDay', 'QuestionCreateMonth', 'QuestionCreateYear',
       'QuestionCreateDayOfWeek', 'QuestionCreateHourOfDay'],
      dtype='object')

In [17]:
X_dummies = pd.get_dummies(X, columns=['QuestionCreateMonth', 'QuestionCreateDay', 'QuestionCreateDayOfWeek', 'QuestionCreateHourOfDay']).drop('QuestionCreateDate', axis=1)

In [18]:
X_dummies.head()

Unnamed: 0_level_0,QuestionUserId,HasError,HasTraceback,HasMultiLineCode,HasBlockCode,HasHyperlink,HasEmbedImage,HasTried,NumTags,HasTagsInTop49,...,QuestionCreateHourOfDay_14,QuestionCreateHourOfDay_15,QuestionCreateHourOfDay_16,QuestionCreateHourOfDay_17,QuestionCreateHourOfDay_18,QuestionCreateHourOfDay_19,QuestionCreateHourOfDay_20,QuestionCreateHourOfDay_21,QuestionCreateHourOfDay_22,QuestionCreateHourOfDay_23
QID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
469,147.0,False,False,False,False,False,False,False,4,True,...,0,1,0,0,0,0,0,0,0,0
502,147.0,False,False,False,False,False,False,False,4,True,...,0,0,0,1,0,0,0,0,0,0
535,154.0,False,False,False,False,False,False,False,3,False,...,0,0,0,0,1,0,0,0,0,0
594,116.0,False,False,False,False,False,False,False,5,False,...,0,0,0,0,0,0,0,0,0,0
683,199.0,False,False,False,True,False,False,False,3,True,...,0,0,0,0,0,0,0,0,0,0


In [19]:
model = LinearRegression()
model.fit(X_dummies, Y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [20]:
training_accuracy = model.score(X_dummies, Y)
print("Training R^2: ", training_accuracy)

Training R^2:  0.1380583738511627


In [21]:
pd.options.display.max_rows = 999

In [22]:
coeffs = pd.DataFrame(data=[X_dummies.columns, model.coef_.squeeze()]).T
coeffs.columns = ['Feature', 'Coeff']
coeffs.sort_values('Coeff', ascending=False)

Unnamed: 0,Feature,Coeff
6,HasEmbedImage,59734.4
15,QuestionCreateMonth_1,54017.7
16,QuestionCreateMonth_2,54017.7
17,QuestionCreateMonth_3,54017.6
18,QuestionCreateMonth_4,54017.6
19,QuestionCreateMonth_5,54017.6
20,QuestionCreateMonth_6,54017.5
21,QuestionCreateMonth_7,54017.5
22,QuestionCreateMonth_8,54017.5
23,QuestionCreateMonth_9,54017.5


Based on these coefficients, none of the time features make much of an impact.

In [23]:
X_nodummy = Questions.set_index('QID').drop(['Tags', 'QuestionTitle', 'QuestionBody', 'QuestionScore', 'NumAnswers', 'QuestionTitleAndBody', 'CodeText', 'TagFreeNonCodeText', 'CodeTextLemmatized', 'TagFreeNonCodeTextLemmatized'], axis=1, inplace=False)

In [24]:
X_nodummy['QuestionCreateDate'] = X_nodummy['QuestionCreateDate'].apply(lambda x: (x-datetime(1970,1,1)).total_seconds())

In [25]:
model_nodummy = LinearRegression()
model_nodummy.fit(X_nodummy, Y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [26]:
training_accuracy = model_nodummy.score(X_nodummy, Y)
print("Training R^2: ", training_accuracy)

Training R^2:  0.13756542410345773


In [28]:
coeffs = pd.DataFrame(data=[X_nodummy.columns, model_nodummy.coef_.squeeze()]).T
coeffs.columns = ['Feature', 'Coeff']
coeffs.sort_values('Coeff', ascending=False)

Unnamed: 0,Feature,Coeff
14,QuestionPercentCode,2.99256
4,HasMultiLineCode,0.119578
10,HasTagsInTop49,0.0894647
5,HasBlockCode,0.0825569
13,QuestionTitleLength,0.0111346
0,QuestionUserId,6.08002e-08
1,QuestionCreateDate,-9.3658e-09
11,QuestionLength,-6.0991e-05
12,QuestionCodeLength,-0.00166233
8,HasTried,-0.00190886
