In [124]:
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import re
import datetime
import sklearn.ensemble
import sklearn
import nltk

In [130]:
questions = pd.read_csv("input/TrainQuestions.csv", encoding='latin1')
answers = pd.read_csv("input/TrainAnswers.csv", encoding='latin1')
tags = pd.read_csv("input/TrainTags.csv", encoding='latin1')
users = pd.read_csv("users.csv", encoding='latin1')
tag_features = pd.read_csv('tags_features.csv', encoding='latin1')
tag_ids = set(tag_features['qnid'].tolist())

In [126]:
with open('en_US.txt') as f:
    en_dict = set(f.read().splitlines())
with open('en_US2.txt') as f:
    en_dict = en_dict.union(set(f.read().splitlines()))
with open('programmingDict.txt') as f:
    en_dict = en_dict.union(set(f.read().splitlines()))
punctuation = set(['.',',',';',':','(',')','!','?','\"'])

In [127]:
def filter_code_links(soup):
    link= soup.a
    while link is not None:
        link.replace_with("link")
        link = link.code

    code = soup.code
    while code is not None:
        code.replace_with("code")
        code = soup.code
        
    return soup.getText()


In [128]:
def count_spell_errors(text):
    sentences = nltk.sent_tokenize(text)
    error_count = 0
    for s in sentences:
        if s[0].islower():
            error_count += 1
        r = ''.join([c for c in s if not c in punctuation])
        words = r.split()
        if words:
            word = words[0]
            if word not in en_dict and word.lower() not in en_dict:
                #print(word)
                error_count+=1
            for w in words[1:]:
                if w not in en_dict:
                    #print(w)
                    error_count+=1
    return error_count

In [129]:
def get_length_error(soup):
    text = filter_code_links(soup)
    error_ct = count_spell_errors(text)
    length = len(text.split())
    error_ratio = 0
    if length > 0:
        error_ratio = error_ct*1.0/length
    return length, error_ratio

In [174]:
def feature(data):
    global users
    feat = [1]
    
    #Getting Body related Features
    soup = BeautifulSoup(data['Body'],"lxml")
    numtags = len(set([tag.name for tag in soup.body.findAll(True)]))
    isimage = 1 if soup.body.findAll("img") else 0
    islink = 1 if soup.body.findAll("a") else 0
    
    code_length = 0
    allCode = soup.body.findAll("code")
    if allCode:
        iscode = 1
        for code in allCode:
            code_length += len(str(code).split())
    else:
        iscode = 0
    soup_body = BeautifulSoup(data['Body'],"lxml")
    body_length, error_ratio_body = get_length_error(soup_body)
    feat.extend([numtags, isimage, islink, iscode, code_length, body_length, error_ratio_body])
    
    #Getting title features
    soup_title = BeautifulSoup(data['Title'],"lxml")
    title_length, error_ratio_title = get_length_error(soup_title)
    feat.extend([title_length, error_ratio_title])
    
    #Getting Creation Date info
    weekday = [0] * 7
    time_of_day = [0] * 4
    year = [0] * 9
    creation_date = datetime.datetime.strptime(data['CreationDate'], "%Y-%m-%dT%H:%M:%SZ")
    weekday[creation_date.weekday()] = 1
    time_of_day[int(creation_date.hour/6)] = 1
    try:
        year[creation_date.year-2008] = 1
    except Exception as e:
        print (creation_date.year)
    feat.extend(weekday)
    feat.extend(time_of_day)
    feat.extend(year)
    
    #Getting User related Features
    badges = -1
    reputation = -1
    accept_rate = -1
    if not np.isnan(data['OwnerUserId']) and int(data['OwnerUserId']) in users['User Id'].values:
        ownerdata = users.loc[users['User Id']==int(data['OwnerUserId'])].iloc[0]
        badges = ownerdata['Bronze Badges'] + ownerdata['Silver Badges'] + ownerdata['Gold Badges']
        reputation = ownerdata['Reputation']
        if not np.isnan(ownerdata['Accept Rate']):
            accept_rate = ownerdata['Accept Rate']
    feat.extend([badges, reputation, accept_rate])
    
    #Getting Tag Related Info
    tags = [0] * 100
    avg_populatrity = 0
    max_popularity = 0
    if data['Id'] in tag_ids:
        tags = tag_features[tag_features['qnid']==data['Id']]['encoded_tags'].tolist()[0]
        if tags.startswith('['):
            tags = tags[1:]
        if tags.endswith(']'):
            tags = tags[:-1]
        tags = (int(x) for x in tags.split(','))
        avg_popularity = float(tag_features[tag_features['qnid']==data['Id']]['avg_popularity'])
        max_popularity = float(tag_features[tag_features['qnid']==data['Id']]['max_popularity'])
    feat.extend(tags)
    feat.append(avg_popularity)
    feat.append(max_popularity)
    return feat

In [175]:
answerparentids = set(answers['ParentId'].tolist())
def will_get_an_answer(data):
    return 1 if data['Id'] in answerparentids else 0

In [172]:
def time_diff_for_first_answer(data):
    if data['Id'] in answers['ParentId']:
        str_creation_dates = answers[answers['ParentId']==data['Id']]['CreationDate']
        min_answer_date = min([datetime.datetime.strptime(i, "%Y-%m-%dT%H:%M:%SZ") for i in str_creation_dates])
        #print(str_creation_dates)
        #print(min_answer_date)
        question_date = datetime.datetime.strptime(data['CreationDate'], "%Y-%m-%dT%H:%M:%SZ")
        #print(question_date)
        if min_answer_date < question_date:
            return (datetime.datetime.utcnow() - question_date).total_seconds()/3600
        else:
            return (min_answer_date - question_date).total_seconds()/3600
        #print(time_diff.total_seconds())
    else:
        question_date = datetime.datetime.strptime(data['CreationDate'], "%Y-%m-%dT%H:%M:%SZ")
        return (datetime.datetime.utcnow() - question_date).total_seconds()/3600

In [176]:
#Testing
X = feature(questions.iloc[0])
print(X,len(X))

[1, 2, 0, 0, 1, 12, 132, 0.07575757575757576, 7, 0.14285714285714285, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 36, 520, 83.0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.0076523077401402205, 0.0274029567677647] 135


In [118]:
#Will you get an answer
X = [feature(questions.iloc[idx]) for idx in range(len(questions))]
print("Got features")

Got features


In [120]:
y = [will_get_an_answer(questions.iloc[idx]) for idx in range(len(questions))]
print("Got labels")

Got labels


In [121]:
X_train = X[:100000]
y_train = y[:100000]
X_val = X[100000:150000]
y_val = y[100000:150000]
X_test = X[150000:]
y_test = y[150000:]
print("Has an answer:",sum(y_train),",Doesnt have an answer:",len(y_train)-sum(y_train))

Has an answer: 87226 ,Doesnt have an answer: 12774


In [122]:
#Will you get an answer: Ada Boost Classifier
abc = sklearn.ensemble.AdaBoostClassifier(n_estimators=700, learning_rate=0.05)
print("Ada Boost Classifier")
abc.fit(X_train, y_train)
#print("Train Score:",gbc.train_score_)
#Accuracy
print("Validation Score:",abc.score(X_val, y_val))
pred_val = abc.predict(X_val)
print("F1 Validation Score:",sklearn.metrics.f1_score(y_val, pred_val))


Ada Boost Classifier
Validation Score: 0.86948
F1 Validation Score: 0.930149419874


In [58]:
print("Feature Imporatance")
feature_names=['Number of tags in body','Is image present','Is URL present','Is Code present','Length of Code',\
              '# words in body','Spell check error ratio in body','# words in title','Spell check error ratio in title', \
              'One hot encoded weekday','One hot encoded hour of day','One hot encoded year','Asker Badges','Asker Reputation', 'Asker Accept Rate']
print(abc.feature_importances_)

Features
[ 0.          0.03        0.03571429  0.00285714  0.01        0.09142857
  0.04285714  0.02285714  0.06571429  0.01857143  0.19857143  0.34571429
  0.13571429]


In [47]:
#Will you get an answer: Gradient Boosting Classifier
params = {'n_estimators': 900, 'max_depth': 6, 'min_samples_split': 3,
          'learning_rate': 0.1, 'loss': 'exponential'}
print("Gradient Boosting Classifier")
gbc = sklearn.ensemble.GradientBoostingClassifier(**params)
gbc.fit(X_train, y_train)
#print("Train Score:",gbc.train_score_)
#Accuracy
print("Validation Score:",gbc.score(X_val, y_val))
pred_val = gbc.predict(X_val)
print("F1 Validation Score:",sklearn.metrics.f1_score(y_val, pred_val))
#Gradient Boosting Classifier
#Validation Score: 0.9985
#F1 Validation Score: 0.776119402985

Gradient Boosting Classifier
Validation Score: 0.99848
F1 Validation Score: 0.775147928994


In [23]:
# #Will you get an answer: SVM Classifier
# print("SVM Linear Classifier")
# svmc = sklearn.svm.SVC(0.5, 'linear')
# svmc.fit(X_train, y_train)
# print("Validation Score:",svmc.score(X_val, y_val))
# pred_val = svmc.predict(X_val)
# print("F1 Validation Score:",sklearn.metrics.f1_score(y_val, pred_val))
# #SVM Linear Classifier
# #Validation Score: 0.99728
# #F1 Validation Score: 0.418803418803

Validation Score: 0.99728
F1 Validation Score: 0.418803418803


In [26]:
# #Time(seconds) after which you get the first answer
# y = [time_diff_for_first_answer(questions.iloc[idx]) for idx in range(len(questions))]
# print("Got labels")
# y_train = y[:100000]
# y_val = y[100000:150000]
# y_test = y[150000:]

Got labels


In [27]:
# #Time(seconds) after which you get the first answer: Gradient Boosting Regressor
# params = {'n_estimators': 900, 'max_depth': 6, 'min_samples_split': 3,
#           'learning_rate': 0.05, 'loss': 'ls'}
# print("Gradient Boosting Regressor")
# gbr = sklearn.ensemble.GradientBoostingRegressor(**params)
# gbr.fit(X_train, y_train)

# #R^2 score
# print("Validation Score:",gbr.score(X_val, y_val))
# pred_val = gbr.predict(X_val)
# print("MSE Validation:",sklearn.metrics.mean_squared_error(y_val, pred_val))
# print("MAE Validation:",sklearn.metrics.mean_absolute_error(y_val, pred_val))

# #Gradient Boosting Regressor

Gradient Boosting Regressor
Validation Score: 0.959619858873
MSE Validation: 1.48939631588e+14
MAE Validation: 7901060.68541
