# 1.1 - Introducing the challenge 

# 1.2 - Exploring the data

#### > Load and preview the data

In [None]:
# Read the sample dataset from CSV into a DataFrame.
import pandas as pd
sample_df = pd.read_csv('sample_data.csv')
# Preview the first few rows.
sample_df.head()

#### > Summarize the data

In [None]:
# Print a per-column summary of the frame (dtypes and non-null counts).
sample_df.info()

In [None]:
# Descriptive statistics (count, mean, std, quantiles) for the frame's columns.
sample_df.describe()

# 1.3 - Looking at the datatypes

#### > Objects instead of categories

In [None]:
# Peek at the raw label column before any dtype conversion.
sample_df['label'].head()

#### > Encode labels as categories (sample data)

In [None]:
# First two labels, shown before the column is converted to a categorical.
sample_df.label.head(2)

In [None]:
# Store the label column back with the 'category' dtype, then show the same
# two rows again so the dtype change is visible.
sample_df['label'] = sample_df['label'].astype('category')
sample_df['label'].head(2)

#### > Dummy variable encoding

In [None]:
# One-hot encode the label column; new column names are built from the
# source column name and the value, joined by prefix_sep ('_').
dummies = pd.get_dummies(sample_df[['label']], prefix_sep='_')
dummies.head(2)

#### > Lambda functions 

In [None]:
# A lambda bound to a name: returns its argument multiplied by itself.
square = lambda x: x*x
square(2)  # -> 4

#### > Encode labels as categories

In [None]:
# Convert a whole column (Series) to the 'category' dtype.
categorize_label = lambda x: x.astype('category')
# axis=0 hands each *column* to the function. With axis=1 every row would be
# passed as its own one-element Series, so no column-level categorical is
# produced. Assigning back through the same [['label']] selection keeps the
# left- and right-hand shapes aligned.
sample_df[['label']] = sample_df[['label']].apply(categorize_label, axis=0)
sample_df.info()

# 1.4 - How do we measure success?

#### > Computing log loss with NumPy

In [None]:
import numpy as np
def compute_log_loss(predicted, actual, eps=1e-14):
    """Compute the mean binary log loss between predictions and labels.

    :param predicted: Predicted probabilities in [0, 1]; a scalar or a 1D
        array-like.
    :param actual: The actual binary labels (0 or 1); a scalar or a 1D
        array-like matching ``predicted``.
    :param eps: (optional) Predictions are clipped to ``[eps, 1 - eps]``
        because log(0) is -inf and would make the loss infinite.
    :return: The log loss averaged over all elements.
    """
    # Convert up front so plain Python lists work: ``1 - [0, 1]`` raises
    # TypeError, whereas ``1 - np.array([0, 1])`` is element-wise.
    actual = np.asarray(actual, dtype=float)
    predicted = np.clip(np.asarray(predicted, dtype=float), eps, 1 - eps)
    per_sample = (actual * np.log(predicted)
                  + (1 - actual) * np.log(1 - predicted))
    return -np.mean(per_sample)

In [None]:
# Confident but wrong prediction: loss is -log(1 - 0.9), about 2.30.
compute_log_loss(predicted=0.9, actual=0)

In [None]:
# Maximally uncertain prediction: loss is -log(0.5), about 0.69.
compute_log_loss(predicted=0.5, actual=1)

# 2.1 - It's time to build a model

#### > Splitting the data

In [None]:
# Numeric feature columns only; missing values become the sentinel -1000.
data_to_train = df[NUMERIC_COLUMNS].fillna(-1000)
# Indicator (dummy) matrix built from the label columns.
labels_to_use = pd.get_dummies(df[LABELS])
# Split with a fixed seed; size=0.2 is presumably the held-out fraction
# (multilabel_train_test_split is defined elsewhere; confirm there).
(X_train, X_test,
 y_train, y_test) = multilabel_train_test_split(
    data_to_train,
    labels_to_use,
    size=0.2,
    seed=123,
)

#### > Training the model

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
# Wrap logistic regression in a one-vs-rest classifier and train it on the
# current split.
clf = OneVsRestClassifier(estimator=LogisticRegression())
clf.fit(X_train, y_train)

# 2.2 - Making predictions

#### > Predicting on holdout data

In [None]:
# Load the holdout rows and keep only the numeric columns, filling missing
# values with -1000, in one chained expression.
holdout = (
    pd.read_csv('HoldoutData.csv', index_col=0)[NUMERIC_COLUMNS]
    .fillna(-1000)
)
# Class probabilities for every holdout row.
predictions = clf.predict_proba(holdout)

#### > Format and submit predictions

In [None]:
# Wrap the probability matrix in a DataFrame indexed like the holdout data,
# with column names taken from the '__'-separated dummy encoding of the labels.
prediction_df = pd.DataFrame(
    data=predictions,
    index=holdout.index,
    columns=pd.get_dummies(df[LABELS], prefix_sep='__').columns,
)
# Write the submission file and score it with the externally defined helper.
prediction_df.to_csv('predictions.csv')
score = score_submission(pred_path='predictions.csv')

# 2.3 - A very brief introduction to NLP

# 2.4 - Representing text numerically

#### > Using CountVectorizer() on column of main dataset

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# A token is a run of non-whitespace characters followed by whitespace.
# A raw string keeps the regex at a single level of escaping; the previous
# '\\\\S' form matched a literal backslash followed by 'S' characters.
TOKENS_BASIC = r'\S+(?=\s+)'
# Assign the filled column back instead of calling fillna(inplace=True) on an
# attribute-accessed column, which may operate on a temporary copy.
df['Program_Description'] = df['Program_Description'].fillna('')
vec_basic = CountVectorizer(token_pattern=TOKENS_BASIC)

In [None]:
# Learn the token vocabulary from the Program_Description column.
vec_basic.fit(df.Program_Description)

In [None]:
msg = 'There are {} tokens in Program_Description if tokens are any non-whitespace'
# vocabulary_ maps each learned token to its column index, so its length is
# the token count. Unlike get_feature_names() (removed in scikit-learn 1.2),
# this attribute exists in old and new releases alike.
print(msg.format(len(vec_basic.vocabulary_)))

# 3.1 - Pipelines, feature & text preprocessing

#### > Instantiate simple pipeline with one step

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

In [None]:
# Single-step pipeline: only the classifier, registered under the name 'clf'.
pl = Pipeline(steps=[
    ('clf', OneVsRestClassifier(estimator=LogisticRegression())),
])

#### > Train and test with sample numeric data

In [None]:
# Re-display the first rows of the sample data used by the cells below.
sample_df.head()

In [None]:
from sklearn.model_selection import train_test_split
# Features: the single 'numeric' column. Targets: dummy-encoded labels.
X_train, X_test, y_train, y_test = train_test_split(
    sample_df[['numeric']],
    pd.get_dummies(sample_df['label']),
    random_state=2,
)
pl.fit(X_train, y_train)

In [None]:
# Score the fitted pipeline on the held-out split and report it.
accuracy = pl.score(X_test, y_test)
print('accuracy on numeric data, no nans: ', accuracy)

#### > Adding more steps to the pipeline

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    sample_df[['numeric', 'with_missing']],
    pd.get_dummies(sample_df['label']),
    random_state=2,  # restored: the argument was cut off, leaving the call unclosed
)
# NOTE(review): 'with_missing' presumably contains NaNs and this pipeline has
# no imputation step, so the fit may raise; confirm that is the intent here.
pl.fit(X_train, y_train)

#### > Preprocessing numeric features with missing data

In [None]:
# sklearn.preprocessing.Imputer was removed in scikit-learn 0.22 in favour of
# sklearn.impute.SimpleImputer. Alias the replacement so Imputer() keeps
# working on both old and new releases.
try:
    from sklearn.preprocessing import Imputer
except ImportError:
    from sklearn.impute import SimpleImputer as Imputer
X_train, X_test, y_train, y_test = train_test_split(
    sample_df[['numeric', 'with_missing']],
    pd.get_dummies(sample_df['label']),
    random_state=2,
)
# Impute missing values (default strategy) before the classifier sees them.
pl = Pipeline([
    ('imp', Imputer()),
    ('clf', OneVsRestClassifier(LogisticRegression())),
])

In [None]:
# Fit the pipeline bound to `pl`. The name `pipeline` is never defined, so the
# original call raised NameError before `pl` could be scored.
pl.fit(X_train, y_train)
accuracy = pl.score(X_test, y_test)
print('accuracy on all numeric, incl nans: ', accuracy)

# 3.2 - Text features and feature unions

#### > Preprocessing text features

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# The original call was missing the commas between arguments and the closing
# parenthesis of get_dummies(), which made the cell a syntax error.
X_train, X_test, y_train, y_test = train_test_split(
    sample_df['text'],
    pd.get_dummies(sample_df['label']),
    random_state=2,
)
# Token counts from the text column feed the one-vs-rest classifier.
pl = Pipeline([
    ('vec', CountVectorizer()),
    ('clf', OneVsRestClassifier(LogisticRegression())),
])

In [None]:
# Train the pipeline currently bound to `pl` on the training split.
pl.fit(X_train, y_train)

In [None]:
# Score the fitted pipeline on the held-out split and report it.
accuracy = pl.score(X_test, y_test)
print('accuracy on sample data: ', accuracy)

#### > Putting it all together

In [None]:
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import FunctionTransformer

# Keep both numeric columns and the text column in one feature frame; the
# targets are the dummy-encoded labels.
X_train, X_test, y_train, y_test = train_test_split(
    sample_df[['numeric', 'with_missing', 'text']],
    pd.get_dummies(sample_df['label']),
    random_state=2,
)

In [None]:
# Column selectors wrapped as transformers so they can be pipeline steps.
# validate=False is passed so the input is handed to the lambda unchecked,
# which the column-name indexing below relies on.
get_text_data = FunctionTransformer(lambda x: x['text'],validate=False)
get_numeric_data = FunctionTransformer(lambda x: x[['numeric','with_missing']], validate=False)

#### > FeatureUnion Text and Numeric Features

In [None]:
from sklearn.pipeline import FeatureUnion

# Build the two branches here so the cell runs top to bottom; they were
# referenced before any earlier cell had created them (NameError).
numeric_pipeline = Pipeline([
    ('selector', get_numeric_data),
    ('imputer', Imputer()),
])
text_pipeline = Pipeline([
    ('selector', get_text_data),
    ('vectorizer', CountVectorizer()),
])
# Run both branches on the same input and join their outputs side by side.
union = FeatureUnion([
    ('numeric', numeric_pipeline),
    ('text', text_pipeline),
])

#### > Putting it all together

In [None]:
# Numeric branch: select the numeric columns, then impute missing values.
numeric_pipeline = Pipeline(steps=[
    ('selector', get_numeric_data),
    ('imputer', Imputer()),
])
# Text branch: select the text column, then turn it into token counts.
text_pipeline = Pipeline(steps=[
    ('selector', get_text_data),
    ('vectorizer', CountVectorizer()),
])
# Full model: both branches joined by a FeatureUnion, then the classifier.
pl = Pipeline(steps=[
    ('union', FeatureUnion(transformer_list=[
        ('numeric', numeric_pipeline),
        ('text', text_pipeline),
    ])),
    ('clf', OneVsRestClassifier(estimator=LogisticRegression())),
])

# 3.3 - Choosing a classification model

#### > Main dataset: lots of text

In [None]:
# Target columns to predict.
LABELS = [
    'Function', 'Use', 'Sharing', 'Reporting', 'Student_Type',
    'Position_Type', 'Object_Type', 'Pre_K', 'Operating_Status',
]
# Every remaining column of df, in its original order.
NON_LABELS = [column for column in df.columns if column not in LABELS]
# Number of non-label columns beyond the numeric ones (presumably the text
# columns; NUMERIC_COLUMNS is defined elsewhere).
len(NON_LABELS) - len(NUMERIC_COLUMNS)

#### > Using pipeline with the main dataset

In [None]:
import numpy as np
import pandas as pd
# Load the training sample, using the first CSV column as the index.
df = pd.read_csv('TrainingSetSample.csv', index_col=0)
# Dummy-encode the label columns to get the target matrix.
dummy_labels = pd.get_dummies(df[LABELS])
# Split all non-label columns against the encoded labels; 0.2 is the third
# positional argument of the externally defined splitter.
(X_train, X_test,
 y_train, y_test) = multilabel_train_test_split(
    df[NON_LABELS],
    dummy_labels,
    0.2,
)

In [None]:
# Selectors for the main dataset: text comes from the externally defined
# combine_text_columns helper, numbers from the NUMERIC_COLUMNS subset.
get_text_data = FunctionTransformer(func=combine_text_columns, validate=False)
get_numeric_data = FunctionTransformer(
    func=lambda x: x[NUMERIC_COLUMNS],
    validate=False,
)
# Two feature branches joined by a FeatureUnion, followed by a one-vs-rest
# logistic regression.
pl = Pipeline(steps=[
    ('union', FeatureUnion(transformer_list=[
        ('numeric_features', Pipeline(steps=[
            ('selector', get_numeric_data),
            ('imputer', Imputer()),
        ])),
        ('text_features', Pipeline(steps=[
            ('selector', get_text_data),
            ('vectorizer', CountVectorizer()),
        ])),
    ])),
    ('clf', OneVsRestClassifier(estimator=LogisticRegression())),
])

In [None]:
# Train the full pipeline on the main-dataset training split.
pl.fit(X_train, y_train)

#### > Easily try new models using pipeline

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Same preprocessing as the logistic-regression pipeline; only the final
# estimator changes.
pl = Pipeline([
            ('union', FeatureUnion([
                    ('numeric_features', Pipeline([
                        ('selector', get_numeric_data),
                        ('imputer', Imputer())
                    ])),
                    ('text_features', Pipeline([
                        ('selector', get_text_data),
                        ('vectorizer', CountVectorizer())
                    ]))
                ])
            ),
            # The class is OneVsRestClassifier; the bare name `OneVsRest` is
            # not defined anywhere and raised NameError.
            ('clf', OneVsRestClassifier(RandomForestClassifier()))
        ])

# 4.1 - Learning from the expert: processing

#### > N-grams and tokenization

In [None]:
# Count both single tokens and adjacent token pairs (1-grams and 2-grams).
# NOTE(review): TOKENS_ALPHANUMERIC is not defined in the visible cells;
# confirm it is provided before this cell runs.
vec = CountVectorizer(token_pattern=TOKENS_ALPHANUMERIC,
                        ngram_range=(1, 2))

#### > Range of n-grams in scikit-learn

In [None]:
# Refit the pipeline currently bound to `pl`.
# NOTE(review): the `vec` built in the previous cell is not visibly added to
# `pl`; confirm which vectorizer this fit is meant to use.
pl.fit(X_train, y_train)

In [None]:
# Load the holdout set and predict class probabilities with the pipeline.
holdout = pd.read_csv('HoldoutData.csv', index_col=0)
predictions = pl.predict_proba(holdout)
# Name the columns with the '__' separator, matching the earlier submission
# cell that writes the same file for the same score_submission() helper.
prediction_df = pd.DataFrame(
    columns=pd.get_dummies(df[LABELS], prefix_sep='__').columns,
    index=holdout.index,
    data=predictions,
)
prediction_df.to_csv('predictions.csv')
score = score_submission(pred_path='predictions.csv')

# 4.2 - Learning from the expert: a stats trick

#### > Adding interaction features with scikit-learn

In [None]:
from sklearn.preprocessing import PolynomialFeatures
# Display the example input.
# NOTE(review): `x` is not defined in the visible cells; confirm where it is created.
x

In [None]:
# degree=2 with interaction_only=True adds pairwise products of distinct
# features (no squared terms); include_bias=False omits the constant column.
interaction = PolynomialFeatures(degree=2,interaction_only=True,include_bias=False)
interaction.fit_transform(x)

#### > Sparse interaction features

In [None]:
# SparseInteractions is defined outside the visible cells; the result is
# converted with .toarray() for display.
# NOTE(review): presumably a sparse-matrix counterpart of the
# PolynomialFeatures cell above; confirm against its definition.
SparseInteractions(degree=2).fit_transform(x).toarray()

# 4.3 - Learning from the expert: the winning model

#### > Implementing the hashing trick in scikit-learn

In [None]:
from sklearn.feature_extraction.text import HashingVectorizer
# alternate_sign=False stops the hasher from flipping signs, so hashed counts
# stay non-negative. It replaces the non_negative=True option, which was
# deprecated in scikit-learn 0.19 and removed in 0.21.
vec = HashingVectorizer(norm=None,
                        alternate_sign=False,
                        token_pattern=TOKENS_ALPHANUMERIC,
                        ngram_range=(1, 2))