In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

import os
import sys


%matplotlib inline

from multilabel import multilabel_sample_dataframe, multilabel_train_test_split
from SparseInteractions import SparseInteractions
from metrics import multi_multi_log_loss

from sklearn.preprocessing import FunctionTransformer
from sklearn.metrics.scorer import make_scorer
from sklearn.feature_selection import chi2, SelectKBest
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MaxAbsScaler

TOKENS_ALPHANUMERIC = '[A-Za-z0-9]+(?=\\s+)'

In [None]:
# Read the training set (first CSV column becomes the index) and report its
# (rows, columns) shape as a quick sanity check.
df = pd.read_csv(filepath_or_buffer="TrainingData.csv", index_col=0)
print(df.shape)

In [None]:
# Target columns to predict; every remaining column of df is a feature column.
labels = [
    'Function',
    'Object_Type',
    'Operating_Status',
    'Position_Type',
    'Pre_K',
    'Reporting',
    'Sharing',
    'Student_Type',
    'Use',
]
non_labels = [column for column in df.columns if column not in labels]

# Subsample df with the project helper, passing the indicator-encoded labels
# and a fixed seed. The helper's exact sampling rules live in multilabel.py.
sample_size = 40000
sampling = multilabel_sample_dataframe(
    df,
    pd.get_dummies(df[labels]),
    size=sample_size,
    min_count=25,
    seed=42,
)

# Indicator (one-hot) encoding of the label columns of the sampled rows.
dummy_labels = pd.get_dummies(sampling[labels])

In [None]:
# Split the sampled features and their indicator-encoded labels into train and
# test parts using the project helper (fixed seed).
# NOTE(review): the third positional argument (0.2) is presumably the test-set
# size -- confirm against multilabel.multilabel_train_test_split.
X_train, X_test, y_train, y_test = multilabel_train_test_split(
    sampling[non_labels],
    dummy_labels,
    0.2,
    min_count=3,
    seed=42,
)

In [None]:
# Preprocessing: names of the numeric feature columns (everything else that is
# not a label is handled as text).
NUMERIC_COLUMNS = ["FTE", "Total"]

def combine_text_columns(data_frame, to_drop=None):
    """ Takes the dataset as read in, drops the non-feature, non-text columns and
        then combines all of the text columns into a single vector that has all of
        the text for a row.

        Missing values become empty strings and the remaining cells of each row
        are joined with a single space. The input frame is not modified.

        :param data_frame: The data as read in with read_csv (no preprocessing necessary)
        :param to_drop (optional): Column names to leave out of the combined text.
            Names that are not present in ``data_frame`` are ignored. When omitted,
            the numeric and label columns (``NUMERIC_COLUMNS + labels``, looked up
            at call time) are removed.
        :return: A pandas Series indexed like ``data_frame`` with one combined
            string per row.
    """
    if to_drop is None:
        to_drop = NUMERIC_COLUMNS + labels

    # Honour the caller's to_drop, but only drop columns that actually exist
    # in this frame so the same function works on frames with or without labels.
    unwanted = set(to_drop)
    present = [column for column in data_frame.columns if column in unwanted]
    text_data = data_frame.drop(present, axis=1)

    # Replace NaNs with blanks and make every cell a string so that a stray
    # non-text column cannot make the join below raise a TypeError.
    text_data = text_data.fillna("").astype(str)

    # Join all of the text items in a row (axis=1) with a space in between.
    return text_data.apply(lambda row: " ".join(row), axis=1)

# Pipeline-ready wrappers: one yields the combined text of each row, the other
# selects the numeric columns. validate=False passes the DataFrame through
# without scikit-learn's input validation.
get_text_data = FunctionTransformer(func=combine_text_columns, validate=False)
get_numeric_data = FunctionTransformer(
    func=lambda x: x[NUMERIC_COLUMNS],
    validate=False,
)

In [None]:
# Preview the combined text produced for the first five sampled rows.
get_text_data.fit_transform(sampling.head(n=5))

In [None]:
# Preview the numeric columns selected for the first five sampled rows.
get_numeric_data.fit_transform(sampling.head(n=5))

In [None]:
# Wrap the project's multi-label log loss so it can be called as
# scorer(estimator, X, y).
# NOTE(review): with default options make_scorer scores the output of
# estimator.predict() and treats larger values as better; a log loss normally
# wants predicted probabilities and "lower is better" -- confirm this is intended.
log_loss_scorer = make_scorer(score_func=multi_multi_log_loss)

In [None]:
%%time

# set a reasonable number of features before adding interactions
chi_k = 300

# create the pipeline object
pl = Pipeline([
        ('union', FeatureUnion(
            transformer_list = [
                ('numeric_features', Pipeline([
                    ('selector', get_numeric_data),
                    ('imputer', SimpleImputer())
                ])),
                ('text_features', Pipeline([
                    ('selector', get_text_data),
                    ('vectorizer', HashingVectorizer(token_pattern=TOKENS_ALPHANUMERIC,
                                                     alternate_sign=False, norm=None, binary=False,
                                                     ngram_range=(1, 2))),
                    ('dim_red', SelectKBest(chi2, chi_k))
                ]))
             ]
        )),
        ('int', SparseInteractions(degree=2)),
        ('scale', MaxAbsScaler()),
        ('clf', OneVsRestClassifier(LogisticRegression()))
    ])

# fit the pipeline to our training data
pl.fit(X_train, y_train.values)

# print the score of our trained pipeline on our test set
print("Logloss score of trained pipeline: ", log_loss_scorer(pl, X_test, y_test.values))

In [None]:
# Load holdout data
holdout = pd.read_csv('TestData.csv', index_col=0)

# Predicted probability for every indicator column, one row per holdout row
predictions = pl.predict_proba(holdout)

# Format correctly in new DataFrame: prediction_df
# Column names are derived from the sampled label columns -- the same frame the
# training targets (dummy_labels -> y_train) were built from -- so they line up
# with the classifier's outputs (assuming the train/test split keeps the column
# order). prefix_sep="__" puts the double underscore between the label name and
# its category even for labels that contain "_" themselves (e.g. Object_Type);
# doubling the first "_" afterwards would land inside such label names.
prediction_df = pd.DataFrame(
    data=predictions,
    index=holdout.index,
    columns=pd.get_dummies(sampling[labels], prefix_sep="__").columns,
)

# Save prediction_df to csv called "predictions.csv"
prediction_df.to_csv("predictions.csv")