In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import re
import datetime
import sklearn.ensemble
import sklearn
import nltk

In [2]:
questions = pd.read_csv("input/TrainQuestions.csv", encoding='latin1')
answers = pd.read_csv("input/TrainAnswers.csv", encoding='latin1')
tags = pd.read_csv("input/TrainTags.csv", encoding='latin1')
users = pd.read_csv("users.csv", encoding='latin1')

In [3]:
with open('en_US.txt') as f:
    en_dict = set(f.read().splitlines())
with open('en_US2.txt') as f:
    en_dict = en_dict.union(set(f.read().splitlines()))
with open('programmingDict.txt') as f:
    en_dict = en_dict.union(set(f.read().splitlines()))
punctuation = set(['.',',',';',':','(',')','!','?','\"'])

In [4]:
def filter_code_links(soup):
    link= soup.a
    while link is not None:
        link.replace_with("link")
        link = link.code

    code = soup.code
    while code is not None:
        code.replace_with("code")
        code = soup.code
        
    return soup.getText()


In [12]:
def count_spell_errors(text):
    sentences = nltk.sent_tokenize(text)
    error_count = 0
    for s in sentences:
        if s[0].islower():
            error_count += 1
        r = ''.join([c for c in s if not c in punctuation])
        words = r.split()
        if words:
            word = words[0]
            if word not in en_dict and word.lower() not in en_dict:
                #print(word)
                error_count+=1
            for w in words[1:]:
                if w not in en_dict:
                    #print(w)
                    error_count+=1
    return error_count

In [14]:
def get_length_error(soup):
    text = filter_code_links(soup)
    error_ct = count_spell_errors(text)
    length = len(text.split())
    error_ratio = 0
    if length > 0:
        error_ratio = error_ct*1.0/length
    return length, error_ratio

In [17]:
def feature(data):
    global users
    feat = [1]
    
    #Getting Body related Features
    soup = BeautifulSoup(data['Body'],"lxml")
    numtags = len(set([tag.name for tag in soup.body.findAll(True)]))
    isimage = 1 if soup.body.findAll("img") else 0
    islink = 1 if soup.body.findAll("a") else 0
    
    code_length = 0
    allCode = soup.body.findAll("code")
    if allCode:
        iscode = 1
        for code in allCode:
            code_length += len(str(code).split())
    else:
        iscode = 0
    soup_body = BeautifulSoup(data['Body'],"lxml")
    body_length, error_ratio_body = get_length_error(soup_body)
    feat.extend([numtags, isimage, islink, iscode, code_length, body_length, error_ratio_body])
    
    #Getting title features
    soup_title = BeautifulSoup(data['Title'],"lxml")
    title_length, error_ratio_title = get_length_error(soup_title)
    feat.extend([title_length, error_ratio_title])
    
    #Getting Creation Date info
    weekday = [0] * 7
    time_of_day = [0] * 4
    year = [0] * 10
    creation_date = datetime.datetime.strptime(data['CreationDate'], "%Y-%m-%dT%H:%M:%SZ")
    weekday[creation_date.weekday()] = 1
    time_of_day[creation_date.hour%4] = 1
    try:
        year[creation_date.year-2008] = 1
    except Exception as e:
        print (creation_date.year)
    feat.extend(weekday)
    feat.extend(time_of_day)
    feat.extend(year)
    
    #Getting User related Features
    badges = -1
    reputation = -1
    accept_rate = -1
    if not np.isnan(data['OwnerUserId']) and int(data['OwnerUserId']) in users['User Id'].values:
        ownerdata = users.loc[users['User Id']==int(data['OwnerUserId'])].iloc[0]
        badges = ownerdata['Bronze Badges'] + ownerdata['Silver Badges'] + ownerdata['Gold Badges']
        reputation = ownerdata['Reputation']
        if not np.isnan(ownerdata['Accept Rate']):
            accept_rate = ownerdata['Accept Rate']
    feat.extend([badges, reputation, accept_rate])
    
    return feat

In [8]:
def will_get_an_answer(data):
    return 1 if data['Id'] in answers['ParentId'] else 0

In [9]:
def time_diff_for_first_answer(data):
    if data['Id'] in answers['ParentId']:
        str_creation_dates = answers[answers['ParentId']==data['Id']]['CreationDate']
        min_answer_date = min([datetime.datetime.strptime(i, "%Y-%m-%dT%H:%M:%SZ") for i in str_creation_dates])
        #print(str_creation_dates)
        #print(min_answer_date)
        question_date = datetime.datetime.strptime(data['CreationDate'], "%Y-%m-%dT%H:%M:%SZ")
        #print(question_date)
        if min_answer_date < question_date:
            return (datetime.datetime.utcnow() - question_date).total_seconds()/3600
        else:
            return (min_answer_date - question_date).total_seconds()/3600
        #print(time_diff.total_seconds())
    else:
        question_date = datetime.datetime.strptime(data['CreationDate'], "%Y-%m-%dT%H:%M:%SZ")
        return (datetime.datetime.utcnow() - question_date).total_seconds()/3600

In [18]:
#Testing
X = feature(questions.iloc[0])
print(X)

[1, 2, 0, 0, 1, 12, 132, 0.07575757575757576, 7, 0.14285714285714285, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 36, 520, 83.0]


In [19]:
#Will you get an answer
X = [feature(questions.iloc[idx]) for idx in range(len(questions))]
print("Got features")


Got features
Got labels


In [None]:
y = [will_get_an_answer(questions.iloc[idx]) for idx in range(len(questions))]
print("Got labels")

In [20]:
X_train = X[:100000]
y_train = y[:100000]
X_val = X[100000:150000]
y_val = y[100000:150000]
X_test = X[150000:]
y_test = y[150000:]

In [21]:
#Will you get an answer: Gradient Boosting Classifier
params = {'n_estimators': 900, 'max_depth': 6, 'min_samples_split': 3,
          'learning_rate': 0.05, 'loss': 'exponential'}
print("Gradient Boosting Classifier")
gbc = sklearn.ensemble.GradientBoostingClassifier(**params)
gbc.fit(X_train, y_train)
#print("Train Score:",gbc.train_score_)
#Accuracy
print("Validation Score:",gbc.score(X_val, y_val))
pred_val = gbc.predict(X_val)
print("F1 Validation Score:",sklearn.metrics.f1_score(y_val, pred_val))
#Gradient Boosting Classifier
#Validation Score: 0.9985
#F1 Validation Score: 0.776119402985

Gradient Boosting Classifier
Validation Score: 0.9985
F1 Validation Score: 0.776119402985


In [23]:
#Will you get an answer: SVM Classifier
print("SVM Linear Classifier")
svmc = sklearn.svm.SVC(0.5, 'linear')
svmc.fit(X_train, y_train)
print("Validation Score:",svmc.score(X_val, y_val))
pred_val = svmc.predict(X_val)
print("F1 Validation Score:",sklearn.metrics.f1_score(y_val, pred_val))
#SVM Linear Classifier
#Validation Score: 0.99728
#F1 Validation Score: 0.418803418803

Validation Score: 0.99728
F1 Validation Score: 0.418803418803


In [None]:
#Time(seconds) after which you get the first answer
y = [time_diff_for_first_answer(questions.iloc[idx]) for idx in range(len(questions))]
print("Got labels")
y_train = y[:100000]
y_val = y[100000:150000]
y_test = y[150000:]

Got labels


In [None]:
#Time(seconds) after which you get the first answer: Gradient Boosting Regressor
params = {'n_estimators': 900, 'max_depth': 6, 'min_samples_split': 3,
          'learning_rate': 0.05, 'loss': 'ls'}
print("Gradient Boosting Regressor")
gbr = sklearn.ensemble.GradientBoostingRegressor(**params)
gbr.fit(X_train, y_train)

#R^2 score
print("Validation Score:",gbr.score(X_val, y_val))
pred_val = gbr.predict(X_val)
print("MSE Validation:",sklearn.metrics.mean_squared_error(y_val, pred_val))
print("MAE Validation:",sklearn.metrics.mean_absolute_error(y_val, pred_val))

#Gradient Boosting Regressor

Gradient Boosting Regressor
