# Import Libs

In [1]:
from __future__ import division
from __future__ import print_function
%matplotlib inline

# ignore deprecation warnings in sklearn
import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import os
import sys

# Make the sibling 'src' directory importable so the project-local modules
# (data, features, models) imported in this cell can be resolved.
src_dir = os.path.join(os.getcwd(), os.pardir, 'src')
# Guard the append so re-running this cell does not pile up duplicate sys.path entries.
if src_dir not in sys.path:
    sys.path.append(src_dir)

from data.multilabel import multilabel_sample_dataframe, multilabel_train_test_split
from features.SparseInteractions import SparseInteractions
from models.metrics import multi_multi_log_loss


# Load Data and Display Shape

In [2]:
# Relative path: the CSV is looked up in ../data from the current working directory.
path_to_training_data = os.path.join(os.pardir, 'data', 'TrainingSet.csv')
# index_col=0 turns the first CSV column into the row index instead of a data column.
df = pd.read_csv(path_to_training_data, index_col=0)
# Quick sanity check: (number of rows, number of columns).
print(df.shape)

(400277, 25)


# Set Display Max Columns to 500

In [3]:
pd.set_option('display.max_columns', 500)

# Display First 5 Rows of DF

In [4]:
df.head()

Unnamed: 0,Function,Use,Sharing,Reporting,Student_Type,Position_Type,Object_Type,Pre_K,Operating_Status,Object_Description,Text_2,SubFund_Description,Job_Title_Description,Text_3,Text_4,Sub_Object_Description,Location_Description,FTE,Function_Description,Facility_or_Department,Position_Extra,Total,Program_Description,Fund_Description,Text_1
134338,Teacher Compensation,Instruction,School Reported,School,NO_LABEL,Teacher,NO_LABEL,NO_LABEL,PreK-12 Operating,,,,Teacher-Elementary,,,,,1.0,,,KINDERGARTEN,50471.81,KINDERGARTEN,General Fund,
206341,NO_LABEL,NO_LABEL,NO_LABEL,NO_LABEL,NO_LABEL,NO_LABEL,NO_LABEL,NO_LABEL,Non-Operating,CONTRACTOR SERVICES,BOND EXPENDITURES,BUILDING FUND,(blank),Regular,,,,,RGN GOB,,UNDESIGNATED,3477.86,BUILDING IMPROVEMENT SERVICES,,BUILDING IMPROVEMENT SERVICES
326408,Teacher Compensation,Instruction,School Reported,School,Unspecified,Teacher,Base Salary/Compensation,Non PreK,PreK-12 Operating,Personal Services - Teachers,,,TCHER 2ND GRADE,,Regular Instruction,,,1.0,,,TEACHER,62237.13,Instruction - Regular,General Purpose School,
364634,Substitute Compensation,Instruction,School Reported,School,Unspecified,Substitute,Benefits,NO_LABEL,PreK-12 Operating,EMPLOYEE BENEFITS,TEACHER SUBS,GENERAL FUND,"Teacher, Short Term Sub",Regular,,,,,UNALLOC BUDGETS/SCHOOLS,,PROFESSIONAL-INSTRUCTIONAL,22.3,GENERAL MIDDLE/JUNIOR HIGH SCH,,REGULAR INSTRUCTION
47683,Substitute Compensation,Instruction,School Reported,School,Unspecified,Teacher,Substitute Compensation,NO_LABEL,PreK-12 Operating,TEACHER COVERAGE FOR TEACHER,TEACHER SUBS,GENERAL FUND,"Teacher, Secondary (High)",Alternative,,,,,NON-PROJECT,,PROFESSIONAL-INSTRUCTIONAL,54.166,GENERAL HIGH SCHOOL EDUCATION,,REGULAR INSTRUCTION


# Display Last 5 Rows of DF

In [5]:
df.tail()

Unnamed: 0,Function,Use,Sharing,Reporting,Student_Type,Position_Type,Object_Type,Pre_K,Operating_Status,Object_Description,Text_2,SubFund_Description,Job_Title_Description,Text_3,Text_4,Sub_Object_Description,Location_Description,FTE,Function_Description,Facility_or_Department,Position_Extra,Total,Program_Description,Fund_Description,Text_1
109283,Professional Development,ISPD,Shared Services,Non-School,Unspecified,Instructional Coach,Other Compensation/Stipend,NO_LABEL,PreK-12 Operating,WORKSHOP PARTICIPANT,,,CURRICULUM RESOURCE TEACHER,,,,STAFF DEV AND INSTR MEDIA,,INST STAFF TRAINING SVCS,,,48.62,,GENERAL FUND,STAFF DEV AND INSTR MEDIA
102430,Substitute Compensation,Instruction,School Reported,School,Unspecified,Substitute,Base Salary/Compensation,NO_LABEL,PreK-12 Operating,SALARIES OF PART TIME EMPLOYEE,,FEDERAL GDPG FUND - FY,"Teacher,Retrd Shrt Term Sub",Regular,,,,0.00431,"TITLE II,D",,PROFESSIONAL-INSTRUCTIONAL,128.824985,INSTRUCTIONAL STAFF TRAINING,,INSTRUCTIONAL STAFF
413949,Parent & Community Relations,NO_LABEL,School Reported,School,NO_LABEL,Other,NO_LABEL,NO_LABEL,PreK-12 Operating,,,,School Liaison,,,,,1.0,,,PARENT/TITLE I,4902.29,Misc,Schoolwide Schools,
433672,Library & Media,Instruction,School on Central Budgets,Non-School,Unspecified,Librarian,Benefits,NO_LABEL,PreK-12 Operating,EMPLOYEE BENEFITS,EDUCATIONAL RESOURCE SERVICES,LEVY OVERRIDE,Library Technician II,,,,ED RESOURCE SERVICES,,NON-PROJECT,,OFFICE/ADMINISTRATIVE SUPPORT,4020.29,MEDIA SUPPORT SERVICES,,INSTRUCTIONAL STAFF
415831,Substitute Compensation,Instruction,School Reported,School,Poverty,Substitute,Substitute Compensation,Non PreK,PreK-12 Operating,Salaries And Wages For Substitute Professionals,,"""Title Part A Improving Basic Programs""",TEACHER SUBSTITUTE POOL,,Multilingual Dist Prof Development,Inservice Substitute Teachers Grant Funded,School,,Instruction,Instruction And Curriculum,CERTIFIED SUBSTITUTE,46.53,Accelerated Education,"""Title Part A Improving Basic Programs""",MISCELLANEOUS


# Separate Training and Test Data

In [6]:
# Columns used as prediction targets (one-hot encoded below).
LABELS = [
    'Function',
    'Use',
    'Sharing',
    'Reporting',
    'Student_Type',
    'Position_Type',
    'Object_Type',
    'Pre_K',
    'Operating_Status',
]

# Every remaining column of df, in its original order, is treated as a feature column.
NON_LABELS = [col for col in df.columns if col not in LABELS]

# Indicator (dummy) columns for each value of each label column.
dummy_labels = pd.get_dummies(df[LABELS])

# NOTE(review): 0.02 is presumably the test-set size and min_count the minimum number
# of examples per label kept in the split -- confirm against multilabel_train_test_split.
X_train, X_test, y_train, y_test = multilabel_train_test_split(
    df[NON_LABELS],
    dummy_labels,
    0.02,
    min_count=3,
    seed=43,
)

# Combine Text Columns

In [7]:
NUMERIC_COLUMNS = ['FTE', "Total"]

def combine_text_columns(df, to_drop=None):
    '''
        Takes the dataset as read in,
        drops the non-feature, non-text columns
        and then combines all of the remaining columns
        into a single string that has all of the text for a row

        :param df: The data frame
        :param to_drop (optional): Columns to remove before joining. Defaults to
            NUMERIC_COLUMNS + LABELS, looked up when the function is called so
            that re-running the cells defining those constants takes effect.
        :return: A Series indexed like df with one space-joined string per row
    '''
    if to_drop is None:
        to_drop = NUMERIC_COLUMNS + LABELS

    # drop only the non-text columns that are actually present in the df
    unwanted = set(to_drop)
    present = [col for col in df.columns if col in unwanted]
    text_data = df.drop(present, axis=1)

    # replace nans with blanks first (so they do not become the string 'nan'),
    # then make every value a string so the join cannot fail on non-text values
    text_data = text_data.fillna('').astype(str)

    # joins all of the text items in a row (axis=1)
    return text_data.apply(lambda row: ' '.join(row), axis=1)

# Define Get Text and Numeric Data Functions

In [8]:
from sklearn.preprocessing import FunctionTransformer

def select_numeric_columns(df):
    '''
        Returns only the columns of df that are listed in NUMERIC_COLUMNS.

        Written as a named function instead of a lambda because a
        FunctionTransformer wrapping a lambda cannot be pickled.

        :param df: The data frame
    '''
    return df[NUMERIC_COLUMNS]


# validate=False: hand the input to the wrapped function without sklearn's input validation
get_text_data = FunctionTransformer(combine_text_columns, validate=False)
get_numeric_data = FunctionTransformer(select_numeric_columns, validate=False)

# Get Text First 5 Rows

In [9]:
get_text_data.fit_transform(df.head())

134338       Teacher-Elementary        KINDERGARTEN  KIN...
206341    CONTRACTOR SERVICES BOND EXPENDITURES BUILDING...
326408    Personal Services - Teachers   TCHER 2ND GRADE...
364634    EMPLOYEE BENEFITS TEACHER SUBS GENERAL FUND Te...
47683     TEACHER COVERAGE FOR TEACHER TEACHER SUBS GENE...
dtype: object

# Get Numeric First 5 Rows

In [10]:
get_numeric_data.fit_transform(df.head())

Unnamed: 0,FTE,Total
134338,1.0,50471.81
206341,,3477.86
326408,1.0,62237.13
364634,,22.3
47683,,54.166


# Make Multi-Multi Log Loss Scorer

In [11]:
# make_scorer is exposed by the public sklearn.metrics package; the private
# sklearn.metrics.scorer module path was deprecated and later removed.
from sklearn.metrics import make_scorer

# NOTE(review): with its defaults make_scorer scores the output of predict() and treats
# larger values as better; a log-loss metric usually wants probabilities and
# lower-is-better -- confirm this matches what multi_multi_log_loss expects.
log_loss_scorer = make_scorer(multi_multi_log_loss)

# Import Sklearn Libs

In [12]:
from sklearn.feature_selection import chi2, SelectKBest
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MaxAbsScaler

# Alphanumeric Tokens

In [13]:
# A token is a run of ASCII letters/digits that is followed by at least one
# whitespace character; the lookahead checks for the whitespace without consuming it.
TOKENS_ALPHANUMERIC = r'[A-Za-z0-9]+(?=\s+)'

# Number of Features Before Adding Interactions 

In [14]:
# Passed to SelectKBest(chi2, ...) in the text branch of the pipeline, i.e. how many
# text features are kept before the interaction step runs.
chi_k = 300

# Pipeline

In [15]:
pl = Pipeline([
    # Build the feature matrix from two branches that each receive the same input frame.
    ('union',
         FeatureUnion(
             transformer_list = [
                 # Numeric branch: select the numeric columns, then fill in missing
                 # values using SimpleImputer's default settings.
                 ('numeric_features',
                      Pipeline([
                          ('selector', get_numeric_data),
                          ('imputer', SimpleImputer())
                      ])
                 ),
                 # Text branch: one combined string per row -> unigram/bigram counts
                 # -> the chi_k best features according to the chi2 score.
                 ('text_features', Pipeline([
                     ('selector', get_text_data),
                     ('vectorizer',
                          CountVectorizer(
                              token_pattern=TOKENS_ALPHANUMERIC,
                              binary=False,
                              ngram_range=(1, 2)
                          )
                     ),
                     # k must be given by keyword: current scikit-learn releases
                     # reject it as a positional argument.
                     ('dim_red', SelectKBest(chi2, k=chi_k))
                 ])
                 )
             ]
         )
    ),
    # Project-local transformer, applied to the combined feature matrix.
    ('interaction', SparseInteractions(degree=2)),
    # Scale each feature by its maximum absolute value.
    ('scale', MaxAbsScaler()),
    # One logistic-regression model per target column.
    ('classifier', OneVsRestClassifier(LogisticRegression()))
])

# Fit the Pipeline to Training Data

In [None]:
%%time
# Fit every pipeline step on the training split; y_train.values passes the targets
# as a NumPy array rather than a DataFrame. %%time reports how long the cell took.
pl.fit(X_train, y_train.values)

# Log Loss Score of the Trained Pipeline

In [None]:
# Evaluate the fitted pipeline on the held-out split and report the score.
print(
    'Log loss score of the trained pipeline on the test data',
    log_loss_scorer(pl, X_test, y_test.values)
)

# Predict Holdout Set and Display Predictions

In [None]:
path_to_holdout_data = os.path.join(os.pardir, 'data', 'TestSet.csv')

# Load holdout data; the first CSV column becomes the row index
holdout = pd.read_csv(path_to_holdout_data, index_col=0)

# make predictions (class probabilities from the fitted pipeline)
predictions = pl.predict_proba(holdout)

# format correctly in new data frame: columns are the dummy-encoded label names,
# rows are keyed by the holdout index (previously the undefined name
# `holdoutindex`, which raised a NameError)
pred_df = pd.DataFrame(columns=pd.get_dummies(df[LABELS]).columns,
                       index=holdout.index,
                       data=predictions)

print(pred_df)