### Load data

In [1]:
from __future__ import print_function

In [2]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from textblob import TextBlob
import re

In [3]:
#read in training data and parse dates
train = pd.read_csv('.../StackOverflow/train.csv', parse_dates=['PostCreationDate', 'OwnerCreationDate','PostClosedDate'])

In [4]:
#read in test data and parse dates
test = pd.read_csv('.../StackOverflow/test.csv', parse_dates=['PostCreationDate', 'OwnerCreationDate'])

### Define functions needed for creating new features

In [5]:
#define a funtion which creates a string of words with the word
#"title" to each word in the title
#this is so that the title text can be combined with the bodymarkdown text but will maintain
#different properties
def addtitle(titlewords):
    wordlist = ''
    for word in titlewords:
        wordcombo = 'title'+ str(word) + (' ') 
        wordlist += wordcombo
    return str(wordlist)

In [6]:
# define a function that accepts text and returns the subjectivity - this is for the bodymarkdown
# I also tested whether text polarity differed between open and closed but it did not 
def detect_subjectivity(text):
    
    # use this line for Python 2 (avoids UnicodeDecodeError for some reviews)
    blob = TextBlob(text.decode(encoding='utf-8'))
    
    # return the polarity
    return blob.sentiment.subjectivity

In [7]:
#define a function that identifies whether text (a tag) occurs in the bodymarkdown text
def findtag(text):
        if str(text).lower() in str(train.BodyMarkdown).lower():
            return 1
        else:
            return 0

### Define function to create the new features
These features were chosen after examining boxplots to determine whether the distributions appeared different between open and closed classes. Other features or variants of these features that did not seem to differ were included in some models and eliminated if they did not improve scores, etc. Some models with more features performed worse in cross-validation.

In [8]:
#define a function that makes the new features
def makefeatures(df):
    #scale the undeleted count and reputation values
    df['UndeletedCountScaled'] = preprocessing.scale(df.OwnerUndeletedAnswerCountAtPostTime)
    df['ReputationScaled'] = preprocessing.scale(df.ReputationAtPostCreation)
    
    #create a feature that measures time between author creation and post creation and scale
    df['Seniority'] = (df.PostCreationDate - df.OwnerCreationDate).astype('timedelta64[D]')
    df['SeniorityScaled'] = preprocessing.scale(df.Seniority)
    
    #create features for length of the Title and length of the Body Markdown and scale
    df['TitleLengthScaled'] = preprocessing.scale(df.Title.apply(len))
    df['BodyLengthScaled'] = preprocessing.scale(df.BodyMarkdown.apply(len))
    
    #create a feature with the number of tags for the question and scale
    df['TagCount'] = df.loc[:,['Tag1','Tag2','Tag3','Tag4','Tag5']].count(axis = 1)
    df['TagCountScaled'] = preprocessing.scale(df.loc[:,['Tag1','Tag2','Tag3','Tag4','Tag5']].count(axis = 1))
    
    #create a feature for subjectivity of the BodyMarkdown and the Title
    df['Subjectivity'] = df.BodyMarkdown.apply(detect_subjectivity)
    df['TitleSubjectivity'] = df.Title.apply(detect_subjectivity)
    
    #create a feature for whether or not a "question word" is present in the Title
    df['QuestionWord'] = df.Title.str.contains(r'how|what|when|why|does|will|would|is it', flags=re.IGNORECASE)
    df['QuestionWordInt'] = df.QuestionWord.astype(int)
    
    #create a feature for whether the *second* tag is in the BodyMarkdown - distribution of 
    #first tag did not differ across classes, but second tag did
    df['Tag2Text'] = df['Tag2'].apply(findtag)
    
    #transform text labels to numerical values for Tag1 label and weekday of post creation
    lb = LabelEncoder()
    df['Tag1Label']= lb.fit_transform(df.Tag1)
    df['Weekday'] = lb.fit_transform(df.PostCreationDate.dt.weekday_name)
    
    #create a column which is a list of words in the title for combination with word "title"
    df['TitleText'] = df.Title.str.split()
    #create a column which is the words of the title each combined with the word "title"
    df['TitleCombo'] = df['TitleText'].apply(addtitle)
    
    #create a column which includes the text from the Title, BodyMarkdown, and the tags in one
    # the tags are preceded by the string "tag" to separate them from bodytext for the vectorizer
    df['TextCombo'] = df.TitleCombo +('\s')+ df.BodyMarkdown +('\s')+ 'Tag1'+ df['Tag1'].fillna('') + ('\s')+ 'Tag2'+ df['Tag2'].fillna('') +('\s')+ 'Tag3'+ df['Tag3'].fillna('') 
    
    return df

### Add new features to training and test datasets

In [9]:
#add new features to the training data
makefeatures(train)
train.head()

  flag = np.concatenate(([True], aux[1:] != aux[:-1]))


Unnamed: 0.1,Unnamed: 0,PostId,PostCreationDate,OwnerUserId,OwnerCreationDate,ReputationAtPostCreation,OwnerUndeletedAnswerCountAtPostTime,Title,BodyMarkdown,Tag1,...,Subjectivity,TitleSubjectivity,QuestionWord,QuestionWordInt,Tag2Text,Tag1Label,Weekday,TitleText,TitleCombo,TextCombo
0,0,6046168,2011-05-18 14:14:05,543315,2010-09-17 10:15:06,1,2,For Mongodb is it better to reference an objec...,I am building a corpus of indexed sentences in...,mongodb,...,0.661905,0.633333,True,1,1,2776,6,"[For, Mongodb, is, it, better, to, reference, ...",titleFor titleMongodb titleis titleit titlebet...,titleFor titleMongodb titleis titleit titlebet...
1,1,4873911,2011-02-02 11:30:10,465076,2010-10-03 09:30:58,192,24,How to insert schemalocation in a xml document...,i create a xml document with JAXP and search a...,dom,...,0.369697,0.0,True,1,0,1199,6,"[How, to, insert, schemalocation, in, a, xml, ...",titleHow titleto titleinsert titleschemalocati...,titleHow titleto titleinsert titleschemalocati...
2,2,3311559,2010-07-22 17:21:54,406143,2010-07-22 16:58:20,1,0,Too many lookup tables,What are the adverse effects of having too man...,sql-server,...,0.5,0.5,False,0,0,4227,4,"[Too, many, lookup, tables]",titleToo titlemany titlelookup titletables,titleToo titlemany titlelookup titletables \sW...
3,3,9990413,2012-04-03 09:18:39,851755,2011-07-19 10:22:40,4,1,What is this PHP code in VB.net,I am looking for the vb.net equivalent of this...,php,...,0.525,0.0,True,1,1,3339,5,"[What, is, this, PHP, code, in, VB.net]",titleWhat titleis titlethis titlePHP titlecode...,titleWhat titleis titlethis titlePHP titlecode...
4,4,10421966,2012-05-02 21:25:01,603588,2011-02-04 18:05:34,334,14,Spring-Data mongodb querying multiple classes ...,"With Spring-Data, you can use the @Document an...",mongodb,...,0.7,0.0625,False,0,1,2776,6,"[Spring-Data, mongodb, querying, multiple, cla...",titleSpring-Data titlemongodb titlequerying ti...,titleSpring-Data titlemongodb titlequerying ti...


In [32]:
#add new features to the test data
makefeatures(test)
test.head()



Unnamed: 0,PostId,PostCreationDate,OwnerUserId,OwnerCreationDate,ReputationAtPostCreation,OwnerUndeletedAnswerCountAtPostTime,Title,BodyMarkdown,Tag1,Tag2,...,TagCountScaled,Subjectivity,TitleSubjectivity,QuestionWord,QuestionWordInt,Tag1Label,Weekday,TitleText,TitleCombo,TextCombo
0,11768878,2012-08-01 23:10:12,756422,2011-05-16 21:49:59,155,11,Maven & yui-compressor Plugin issues,I'm using the yui-compressor plugin for maven ...,maven,maven-3,...,0.087312,0.531818,0.0,False,0,1463,6,"[Maven, &, yui-compressor, Plugin, issues]",titleMaven title& titleyui-compressor titlePlu...,titleMaven title& titleyui-compressor titlePlu...
1,11768880,2012-08-01 23:10:21,1569892,2012-08-01 22:24:37,1,0,Inconsistent behaviour of html select dropdowns,I have written a javascript-generated web page...,html,select,...,0.895384,0.263043,0.0,False,0,1071,6,"[Inconsistent, behaviour, of, html, select, dr...",titleInconsistent titlebehaviour titleof title...,titleInconsistent titlebehaviour titleof title...
2,11803678,2012-08-03 21:40:49,1301879,2012-03-29 21:01:29,781,37,Why Does MSFT C# Compiler Compile fixed Statem...,The .NET c# compiler (.NET 4.0) compiles the `...,c#,.net,...,0.895384,0.38947,0.2,True,1,305,0,"[Why, Does, MSFT, C#, Compiler, Compile, fixed...",titleWhy titleDoes titleMSFT titleC# titleComp...,titleWhy titleDoes titleMSFT titleC# titleComp...
3,11803496,2012-08-03 21:24:02,1196150,2012-02-08 02:20:44,538,0,Dump sql file to ClearDB in Heroku,I have a sql file that I want to be dumped int...,mysql,ruby-on-rails,...,0.087312,0.2,0.0,False,0,1583,0,"[Dump, sql, file, to, ClearDB, in, Heroku]",titleDump titlesql titlefile titleto titleClea...,titleDump titlesql titlefile titleto titleClea...
4,11803700,2012-08-03 21:43:13,772581,2009-11-13 16:24:05,70,2,mysql query to get rows with conditions,"\r\nI have a table called ""articles"" on the da...",mysql,query,...,-0.720759,0.55,0.0,False,0,1583,0,"[mysql, query, to, get, rows, with, conditions]",titlemysql titlequery titleto titleget titlero...,titlemysql titlequery titleto titleget titlero...


In [25]:
# define a function that accepts a DataFrame and returns the manually created features
def get_manual(df):
    return df.loc[:, ['UndeletedCountScaled', 'ReputationScaled','SeniorityScaled','BodyLengthScaled','TitleLengthScaled','TagCountScaled','Tag2Text','TitleSubjectivity','Subjectivity','QuestionWordInt']]

In [26]:
from sklearn.preprocessing import FunctionTransformer

In [27]:
# create a stateless transformer from the get_manual function
get_manual_ft = FunctionTransformer(get_manual, validate=False)
type(get_manual_ft)

sklearn.preprocessing._function_transformer.FunctionTransformer

### Instantiate logistic regression model, define X and y, get cross_val score
I used logistic regression since the evaluation metric was going to be log loss. 
Without using class weights, my Public Leaderboard score got down to about 0.35. However, I realized that as my scores improved, the probabilities I was submitting were generally higher. This lead me to believe that many of the StackOverflow questions in the test set were open (Class 1) - more so than in the training data (which was 50-50 split). This was also indicated by the information from StackOverflow that only about 6% of their questions get closed. With the potentional for such an unbalanced class distribution, it made sense to try weighting the classes differently in predicting probabilities. This did lead to higher cross_val scores in my internal checks since cross validation was being conducted on the 50-50 training data...So this could in theory be risky (for going from public to private leaderboard), except that the test set was so large and the pattern in the probabilities was so consistent across it.

In [100]:
#instantiate logistic regression model with a class weight of 6 for Class 1, the open class
#a class weight of 9 lead to slight improvement on the public leaderboard (0.15), but in the interest of being 
#slightly more conservative for the public leaderboard I submitted 6 to 1 - in case the distribution was not as skewed
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(class_weight={1:6})

In [29]:
#define X and y
X = get_manual_ft.transform(train)
y = train.OpenStatus

In [30]:
X.head(5)

Unnamed: 0,UndeletedCountScaled,ReputationScaled,SeniorityScaled,BodyLengthScaled,TitleLengthScaled,TagCountScaled,Tag2Text,TitleSubjectivity,Subjectivity,QuestionWordInt
0,-0.193828,-0.202586,0.101018,-0.221456,1.364503,-1.380128,1,0.633333,0.661905,1
1,0.05029,-0.126061,-0.346755,0.085657,0.285825,0.204326,0,0.0,0.369697,1
2,-0.216021,-0.202586,-0.798228,-0.437045,-1.23413,0.204326,0,0.5,0.5,0
3,-0.204925,-0.201384,0.156528,0.059895,-0.841884,-0.587901,1,0.0,0.525,1
4,-0.060673,-0.069167,0.878145,-0.149592,1.315472,-0.587901,1,0.0625,0.7,0


In [64]:
# evaluate with 5-fold cross-validation
# this is the cross_val score I got without weighting - so the most representative score for 
# the prediction power of the features alone
from sklearn.cross_validation import cross_val_score
cross_val_score(logreg, X, y, cv=5, scoring='log_loss').mean()

-0.66639946757234036

In [39]:
# the Tag 2 in text feature was not included in the test data - adding it here
def findtagtest(text):
        if str(text).lower() in str(test.BodyMarkdown).lower():
            return 1
        else:
            return 0

test['Tag2Text'] = test['Tag2'].apply(findtagtest)

In [41]:
#transform test set as the training set was transoformed
testset = get_manual_ft.transform(test)
testset.head()

Unnamed: 0,UndeletedCountScaled,ReputationScaled,SeniorityScaled,BodyLengthScaled,TitleLengthScaled,TagCountScaled,Tag2Text,TitleSubjectivity,Subjectivity,QuestionWordInt
0,-0.081026,-0.122563,0.364912,1.289627,-0.723031,0.087312,0,0.0,0.531818,0
1,-0.172551,-0.165595,-0.901078,0.588896,-0.171834,0.895384,1,0.0,0.263043,0
2,0.135307,0.05236,-0.538142,5.53238,0.429471,0.895384,1,0.2,0.38947,1
3,-0.172551,-0.015542,-0.395254,-0.334642,-0.823248,0.087312,0,0.0,0.2,0
4,-0.15591,-0.146315,1.939541,0.038947,-0.572704,-0.720759,1,0.0,0.55,0


### Fit the logistic regression model for the feature set and predict probabilities for the test data based on these features alone
I used ensembling with the feature data and the text/vectorizer data separately. I compared the ensembling procedure below (with features and vectorizer data separately), with the results of a model using a FeatureUnion pipeline and predicting probabilites using the single pipeline. The results of the two methods were about the same, when I just averaged the feature probabilities with the vectorizer probabilities in the ensembling procedure. However, the features set overall was not adding a lot to the model (cross_val scores were only around .66), so ensembling and _weighting_ the vectorizer features more heavily than the engineered features lead to better public Leaderboard scores (around 0.35). The features _do_ add something though - with only the vectorized text data my score was around 0.50.

In [101]:
logreg.fit(X,y)

LogisticRegression(C=1.0, class_weight={1: 9}, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [102]:
new_feat_pred = logreg.predict_proba(testset)
new_feat_pred

array([[ 0.05484463,  0.94515537],
       [ 0.09893664,  0.90106336],
       [ 0.01836226,  0.98163774],
       ..., 
       [ 0.10720657,  0.89279343],
       [ 0.1365372 ,  0.8634628 ],
       [ 0.17502557,  0.82497443]])

### Use TF-IDF vectorizer on the text data and fit model
I tested different parameters in the vectorizer and the only features that lead to improvement in cross_val scores were the ngram range and stop words definitions.

In [68]:
from sklearn.feature_extraction.text import TfidfVectorizer
vect = TfidfVectorizer(ngram_range = (1,3), stop_words='english')

In [69]:
vect

TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 3), norm=u'l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern=u'(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [103]:
#for text data only
textpipe = make_pipeline(vect, logreg)

In [72]:
#run the vectorizer on the created column which includes the title info and the tags
textdata = train.TextCombo
textdata[1:5]

1    titleHow titleto titleinsert titleschemalocati...
2    titleToo titlemany titlelookup titletables \sW...
3    titleWhat titleis titlethis titlePHP titlecode...
4    titleSpring-Data titlemongodb titlequerying ti...
Name: TextCombo, dtype: object

In [None]:
#score was about -0.5065
cross_val_score(textpipe, textdata, y, cv=5, scoring='log_loss').mean()

In [104]:
textpipe.fit(textdata, y)

Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 3), norm=u'l2', preprocessor=None, smoo...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

### Predict probabilites for the test data
This is slightly odd, but I used only the BodyMarkdown text for the test set, despite having trained on the TextCombo (body, title, tags) in the training set - this lead to better scores than when I used the combo in the test data. The explanation may lie in the generalizability of the Body Markdown text from the training to the test set...The TFIDF values for the training data were affected by the Title and the Tags, and thus may have not been _as_ influenced by the body itself. This may have helped generalization to the body markdown in the test set, which was not identical to training of course. 

In [74]:
#select text data for test probabilities
testtext = test.BodyMarkdown

In [105]:
#predict class probabilities for the test set
new_text_pred = textpipe.predict_proba(testtext)
new_text_pred

array([[ 0.01503589,  0.98496411],
       [ 0.00459657,  0.99540343],
       [ 0.00202306,  0.99797694],
       ..., 
       [ 0.02111382,  0.97888618],
       [ 0.25484584,  0.74515416],
       [ 0.05216519,  0.94783481]])

In [106]:
#get probabilites of class one from the feature data and the vectorizer
textpred = new_text_pred[:,1]
featpred = new_feat_pred[:,1]

In [107]:
# calculate the predicted probabilities for all rows by weighting the text data vectorizer features more heavily
# tried weightings from x2 to x12. x11 lead to the best performance on the public leaderboard
test['logweight2'] = pd.DataFrame((textpred*11 + featpred) / 12)
test.head(10)

Unnamed: 0,PostId,PostCreationDate,OwnerUserId,OwnerCreationDate,ReputationAtPostCreation,OwnerUndeletedAnswerCountAtPostTime,Title,BodyMarkdown,Tag1,Tag2,...,Tag1Label,Weekday,TitleText,TitleCombo,TextCombo,Tag2Text,heavyweightprob3,logweight,logweight2,logweight3
0,11768878,2012-08-01 23:10:12,756422,2011-05-16 21:49:59,155,11,Maven & yui-compressor Plugin issues,I'm using the yui-compressor plugin for maven ...,maven,maven-3,...,1463,6,"[Maven, &, yui-compressor, Plugin, issues]",titleMaven title& titleyui-compressor titlePlu...,titleMaven title& titleyui-compressor titlePlu...,0,0.844077,0.944456,0.972307,0.981647
1,11768880,2012-08-01 23:10:21,1569892,2012-08-01 22:24:37,1,0,Inconsistent behaviour of html select dropdowns,I have written a javascript-generated web page...,html,select,...,1071,6,"[Inconsistent, behaviour, of, html, select, dr...",titleInconsistent titlebehaviour titleof title...,titleInconsistent titlebehaviour titleof title...,1,0.904994,0.963014,0.981194,0.987542
2,11803678,2012-08-03 21:40:49,1301879,2012-03-29 21:01:29,781,37,Why Does MSFT C# Compiler Compile fixed Statem...,The .NET c# compiler (.NET 4.0) compiles the `...,c#,.net,...,305,0,"[Why, Does, MSFT, C#, Compiler, Compile, fixed...",titleWhy titleDoes titleMSFT titleC# titleComp...,titleWhy titleDoes titleMSFT titleC# titleComp...,1,0.94206,0.985284,0.994176,0.996615
3,11803496,2012-08-03 21:24:02,1196150,2012-02-08 02:20:44,538,0,Dump sql file to ClearDB in Heroku,I have a sql file that I want to be dumped int...,mysql,ruby-on-rails,...,1583,0,"[Dump, sql, file, to, ClearDB, in, Heroku]",titleDump titlesql titlefile titleto titleClea...,titleDump titlesql titlefile titleto titleClea...,0,0.502241,0.707374,0.805444,0.851012
4,11803700,2012-08-03 21:43:13,772581,2009-11-13 16:24:05,70,2,mysql query to get rows with conditions,"\r\nI have a table called ""articles"" on the da...",mysql,query,...,1583,0,"[mysql, query, to, get, rows, with, conditions]",titlemysql titlequery titleto titleget titlero...,titlemysql titlequery titleto titleget titlero...,1,0.728354,0.884388,0.936322,0.955811
5,11927241,2012-08-13 01:29:24,747871,2011-05-11 01:38:46,176,0,Replace all '-' in a url,my code now only replaces the first `-` with s...,jquery,,...,1278,1,"[Replace, all, '-', in, a, url]",titleReplace titleall title'-' titlein titlea ...,titleReplace titleall title'-' titlein titlea ...,0,0.660596,0.831422,0.900151,0.92839
6,11927226,2012-08-13 01:26:26,1019215,2011-10-28 22:52:21,2657,161,"IIS Permissions: 'Everyone' works, but IIS Def...",I got an error stating that the Site.master co...,asp.net,permissions,...,157,1,"[IIS, Permissions:, 'Everyone', works,, but, I...",titleIIS titlePermissions: title'Everyone' tit...,titleIIS titlePermissions: title'Everyone' tit...,0,0.829004,0.931079,0.963323,0.975037
7,11927247,2012-08-13 01:31:03,1402540,2012-05-18 05:18:06,11,0,iCloud Save data With UIDocument crash,I am trying to use iCloud to store my app's us...,ios,icloud,...,1182,1,"[iCloud, Save, data, With, UIDocument, crash]",titleiCloud titleSave titledata titleWith titl...,titleiCloud titleSave titledata titleWith titl...,1,0.907899,0.9687,0.985409,0.990763
8,11927248,2012-08-13 01:31:05,492015,2010-10-30 07:49:39,202,4,Java thread stops with no Exception,When I use 4 threads for my program there is u...,java,multithreading,...,1217,1,"[Java, thread, stops, with, no, Exception]",titleJava titlethread titlestops titlewith tit...,titleJava titlethread titlestops titlewith tit...,0,0.521534,0.712785,0.803684,0.846579
9,11927254,2012-08-13 01:31:45,1367229,2012-05-01 05:47:07,70,5,Inserting images into a PDF using JODConverter,Basically I'm scanning in a whole bunch of doc...,java,netbeans,...,1217,1,"[Inserting, images, into, a, PDF, using, JODCo...",titleInserting titleimages titleinto titlea ti...,titleInserting titleimages titleinto titlea ti...,1,0.79166,0.914257,0.953906,0.968524


In [108]:
pd.DataFrame({'id':test.PostId, 'OpenStatus':test.logweight3}).set_index('id').to_csv('sub26_logweight.csv')