In [None]:
%pylab inline

In [None]:
from __future__ import print_function
from __future__ import division

In [None]:
import pickle
import sys
import time
import unicodedata

import pandas as pd
import seaborn as sns
import sklearn
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.svm import LinearSVC

In [None]:
# NOTE(review): pickle.load can execute arbitrary code embedded in the file --
# only load pickles that were produced by a trusted pipeline.
# The files are opened in binary mode (required by pickle on Python 3 and on
# Windows) and inside context managers so the handles are closed right away.
with open("subset_issue.pkl", "rb") as issue_file:
    issues = pickle.load(issue_file)
with open("comment_text.pkl", "rb") as comment_file:
    comment_text = pickle.load(comment_file)

Build a translation table that removes punctuation from text.

In [None]:
# Resolve the Python 2 / Python 3 spellings once so this cell runs on both,
# matching the `from __future__` imports at the top of the notebook.
try:  # Python 2
    _code_points = xrange(sys.maxunicode + 1)
    _to_char = unichr
except NameError:  # Python 3: xrange/unichr were folded into range/chr
    _code_points = range(sys.maxunicode + 1)
    _to_char = chr

# Translation table for unicode `.translate()`: every code point whose Unicode
# general category starts with 'P' (punctuation) maps to None, i.e. is deleted.
# The range is inclusive of sys.maxunicode so that no code point is skipped.
table = dict.fromkeys(code_point for code_point in _code_points
                      if unicodedata.category(_to_char(code_point)).startswith('P'))

### Clean the text

In [None]:
def get_text_components_per_issue(issues):
    """Collect the cleaned comment text and the component set of every issue.

    Relies on two notebook-level names: ``comment_text`` (looked up by comment
    id) and ``table`` (the punctuation-removal translation table).

    Parameters
    ----------
    issues : object with ``iterrows()``
        Each row must expose ``row["comments"]`` (iterable of comment ids) and
        ``row["components"]`` (iterable of component labels).

    Returns
    -------
    (list, list)
        Two parallel lists with one entry per row: the issue's comments,
        stripped, de-punctuated and joined with single spaces, and the set of
        its components.
    """
    def _cleaned(comment_id):
        # Trim surrounding whitespace first, then drop punctuation.
        return comment_text[comment_id].strip().translate(table)

    text_per_issue = []
    components_per_issue = []
    for _, issue in issues.iterrows():
        pieces = [_cleaned(comment_id) for comment_id in issue["comments"]]
        text_per_issue.append(" ".join(pieces).strip())
        components_per_issue.append(set(issue["components"]))

    return text_per_issue, components_per_issue
    

In [None]:
# Build two parallel lists: cleaned text per issue and component set per issue.
text_per_issue, components_per_issue = get_text_components_per_issue(issues)

### Filter out components that are used too infrequently (not enough signal) or too frequently (signal not meaningful)

In [None]:
def prune_and_bin_components(components_per_issue, prune_low=0.005, prune_high=0.25):
    """Binarise component labels after dropping too-rare and too-common ones.

    A component's frequency is its share of *all* label assignments (its
    column sum divided by the sum of the whole indicator matrix). Components
    are kept only when that share is strictly between ``prune_low`` and
    ``prune_high``.

    Parameters
    ----------
    components_per_issue : iterable of set
        One set of component labels per issue.
    prune_low, prune_high : float
        Exclusive lower / upper bounds on a component's share.

    Returns
    -------
    (bins, mlb)
        The indicator matrix over the surviving components and the
        ``MultiLabelBinarizer`` fitted on them. Issues are never removed; an
        issue whose components were all pruned ends up with an empty set.
    """
    # First pass: binarise everything just to measure how often each label occurs.
    counting_binarizer = MultiLabelBinarizer()
    indicator = counting_binarizer.fit_transform(components_per_issue)

    share = indicator.sum(axis=0) / indicator.sum()
    within_bounds = (share > prune_low) & (share < prune_high)
    exclude_comp_ids = set(counting_binarizer.classes_[~within_bounds])

    comps_per_issue_exclude = [comp_set - exclude_comp_ids
                               for comp_set in components_per_issue]

    # Second pass: refit so the columns only cover the surviving components.
    mlb = MultiLabelBinarizer()
    bins = mlb.fit_transform(comps_per_issue_exclude)
    return bins, mlb

In [None]:
# Label indicator matrix (and its fitted binarizer) using the default pruning thresholds.
bins, mlb = prune_and_bin_components(components_per_issue)

### Tokenize the text and perform tfidf transformations

In [None]:
# Count features over unigrams and bigrams.
bigram_vectorizer = CountVectorizer(ngram_range=(1, 2),
                                    # A token is any run of one or more word characters,
                                    # so single-character tokens are kept.
                                    token_pattern=r'\b\w+\b',
                                    # Drop terms seen in fewer than 5 documents ...
                                    min_df=5,
                                    # ... or in more than half of the documents.
                                    max_df=0.5,
                                    # Use scikit-learn's built-in English stop-word list.
                                    stop_words='english')

# Re-weights the raw counts with tf-idf (default settings).
tfidf_transformer =  TfidfTransformer()

In [None]:
# Learn the vocabulary and the idf weights, and transform the texts, in one go.
# NOTE(review): both are fitted on *all* issues, i.e. before the train/test split
# below, so the held-out issues influence the vocabulary and idf weights.
counts = bigram_vectorizer.fit_transform(text_per_issue)
tfidf = tfidf_transformer.fit_transform(counts)

In [None]:
# Hold out 20% of the issues for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(tfidf, bins, train_size=0.8, random_state=42)

### Train a very simple linear model

In [None]:
# One-vs-rest: an independent linear SVM is trained for every label column of y_train.
clf = OneVsRestClassifier(LinearSVC(C=1.0))
clf.fit(X_train, y_train)

### Predict and analyze the results

In [None]:
# Predicted label indicator matrix for the held-out issues.
predictions = clf.predict(X_test)

In [None]:
# Fraction of individual (issue, label) cells that were predicted correctly.
(y_test == predictions).sum() / (y_test.shape[0] * y_test.shape[1])

In [None]:
# Exact-match ratio: share of held-out issues whose complete label row is
# predicted correctly. The row width is taken from y_test itself instead of a
# hard-coded label count, so the metric stays correct when pruning keeps a
# different number of components.
np.sum((y_test == predictions).all(axis=1)) / y_test.shape[0]

In [None]:
# Overlaid histograms of the number of labels per issue: true (first) vs. predicted (second).
sns.distplot(y_test.sum(axis=1), kde=False)
sns.distplot(predictions.sum(axis=1), kde=False)

In [None]:
# One bar per label column: true counts (red) overlaid with predicted counts (blue).
# The x positions are derived from the width of y_test rather than a hard-coded
# label count, so the plot keeps working when the number of labels changes.
label_positions = range(y_test.shape[1])
sns.barplot(label_positions, y_test.sum(axis=0), color="red")
sns.barplot(label_positions, predictions.sum(axis=0), color="blue")

### Serialize the data and the model

In [None]:
def serialize_data_model(vectorizer, classifier, features, targets, transformer=None):
    """Pickle the fitted pipeline pieces and the data into the working directory.

    All files share a prefix made of the current Unix time in whole seconds:

    * ``<ts>-vectorizer.pkl``  -- ``vectorizer``
    * ``<ts>-classifier.pkl``  -- ``classifier``
    * ``<ts>.pkl``             -- ``{"features": features, "targets": targets}``
    * ``<ts>-transformer.pkl`` -- ``transformer`` (only when one is supplied)

    Parameters
    ----------
    vectorizer, classifier : picklable objects
    features, targets : picklable objects stored together in ``<ts>.pkl``
    transformer : picklable object or None
        Skipped only when it is ``None``; an object that merely evaluates as
        falsy (e.g. an empty container) is still written.

    Returns
    -------
    None
    """
    current_time = int(time.time())

    def _dump(obj, filename):
        # The context manager flushes and closes the file even if pickling fails.
        with open(filename, "wb") as handle:
            pickle.dump(obj, handle)

    _dump(vectorizer, "{}-vectorizer.pkl".format(current_time))
    _dump(classifier, "{}-classifier.pkl".format(current_time))

    training = {"features": features, "targets": targets}
    _dump(training, "{}.pkl".format(current_time))

    # Explicit None check: the truthiness of an arbitrary object is not a
    # reliable "was it provided" signal.
    if transformer is not None:
        _dump(transformer, "{}-transformer.pkl".format(current_time))
    

In [None]:
# Persist the fitted vectorizer, classifier and tf-idf transformer together with the
# full feature / label matrices (tfidf and bins, not only the training split).
serialize_data_model(bigram_vectorizer, clf, tfidf, bins, tfidf_transformer)