In [1]:
%%file submissions/px/feature_extractor.py
# -*- coding: utf-8 -*-

from __future__ import unicode_literals

from sklearn.feature_extraction.text import TfidfVectorizer


def document_preprocessor(doc):
    """Hook applied to every raw document before it is tokenized.

    Edit the body to apply extra transformations to the document
    ahead of tokenization.

    For now this is an identity transform: the document is handed
    back exactly as it was received.
    """
    unchanged = doc
    return unchanged


def token_processor(tokens):
    """Hook applied to the stream of tokens extracted from a document.

    Edit this generator to transform the tokens (e.g. stemming).

    For now every token is re-emitted unchanged and in its original
    order. The function is a generator, so ``tokens`` is consumed lazily.
    """
    for token in tokens:
        yield token


class FeatureExtractor(TfidfVectorizer):
    """Convert a collection of raw docs to a matrix of TF-IDF features.

    Thin wrapper around ``TfidfVectorizer`` whose ``fit``,
    ``fit_transform`` and ``transform`` take a ``pandas.DataFrame`` and
    read the raw documents from its ``posts`` column.
    """

    def __init__(self):
        # see ``TfidfVectorizer`` documentation for other feature
        # extraction parameters.
        # NOTE: supplying ``preprocessor`` overrides the built-in
        # preprocessing stage of the vectorizer (lowercasing and accent
        # stripping), so any normalisation that is wanted has to be done
        # inside ``document_preprocessor``.
        super(FeatureExtractor, self).__init__(
                analyzer='word', preprocessor=document_preprocessor)

    def fit(self, X_df, y=None):
        """Learn a vocabulary dictionary of all tokens in the raw documents.

        Parameters
        ----------
        X_df : pandas.DataFrame
            a DataFrame, where the text data is stored in the ``posts``
            column.
        y : object, optional
            Unused; accepted so the signature matches the estimator API.

        Returns
        -------
        self : FeatureExtractor
            The fitted extractor.
        """
        super(FeatureExtractor, self).fit(X_df.posts)
        return self

    def fit_transform(self, X_df, y=None):
        """Learn the vocabulary and return the TF-IDF matrix in one pass.

        Delegates to the parent ``fit_transform`` so that the documents
        are tokenized only once, instead of once for ``fit`` and a second
        time for ``transform``.

        Parameters
        ----------
        X_df : pandas.DataFrame
            a DataFrame, where the text data is stored in the ``posts``
            column.
        y : object, optional
            Unused; accepted so the signature matches the estimator API.

        Returns
        -------
        X : sparse matrix
            TF-IDF features of the documents in ``X_df.posts``.
        """
        return super(FeatureExtractor, self).fit_transform(X_df.posts)

    def transform(self, X_df):
        """Transform the documents into TF-IDF features.

        Parameters
        ----------
        X_df : pandas.DataFrame
            a DataFrame, where the text data is stored in the ``posts``
            column.

        Returns
        -------
        X : sparse matrix
            TF-IDF features computed with the vocabulary learned by ``fit``.
        """
        return super(FeatureExtractor, self).transform(X_df.posts)

    def build_tokenizer(self):
        """
        Internal function, needed to plug-in the token processor, cf.
        http://scikit-learn.org/stable/modules/feature_extraction.html#customizing-the-vectorizer-classes
        """
        tokenize = super(FeatureExtractor, self).build_tokenizer()
        # Materialise the generator so the caller receives a plain list
        # of tokens.
        return lambda doc: list(token_processor(tokenize(doc)))




Overwriting submissions/px/feature_extractor.py


In [39]:
%%file submissions/px/classifier.py
# -*- coding: utf-8 -*-
from sklearn.base import BaseEstimator
from sklearn.ensemble import RandomForestClassifier


class Classifier(BaseEstimator):
    """Random-forest classifier wrapped in the estimator interface.

    The feature matrix ``X`` is handed to the underlying
    ``RandomForestClassifier`` unchanged in ``fit``, ``predict`` and
    ``predict_proba``. The forest accepts both dense arrays and sparse
    matrices, so no densification is performed here: converting a
    TF-IDF matrix with ``.todense()`` is memory-hungry and yields a
    ``numpy.matrix``, which recent scikit-learn versions reject.
    """

    def __init__(self):
        self.clf = RandomForestClassifier()

    def fit(self, X, y):
        """Fit the random forest on the training data.

        Parameters
        ----------
        X : array-like or sparse matrix, shape (n_samples, n_features)
            Training features.
        y : array-like, shape (n_samples,)
            Target labels.

        Returns
        -------
        self : Classifier
            The fitted classifier, following the scikit-learn convention.
        """
        self.clf.fit(X, y)
        return self

    def predict(self, X):
        """Return the predicted class label for each row of ``X``."""
        return self.clf.predict(X)

    def predict_proba(self, X):
        """Return the predicted class probabilities for each row of ``X``."""
        return self.clf.predict_proba(X)



Overwriting submissions/px/classifier.py


In [43]:
!ramp_test_submission --submission px

[38;5;178m[1mTesting personality prediction[0m
[38;5;178m[1mReading train and test files from ./data ...[0m
[38;5;178m[1mReading cv ...[0m
[38;5;178m[1mTraining ./submissions/px ...[0m
[38;5;178m[1mCV fold 0[0m
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
	[38;5;178m[1mscore   sacc    acc  tfacc[0m
	[38;5;10m[1mtrain[0m  [38;5;10m[1m[38;5;150m0.697[0m[0m  [38;5;150m0.989[0m  [38;5;10m[1m[38;5;150m0.697[0m[0m
	[38;5;12m[1mvalid[0m  [38;5;12m[1m[38;5;105m0.174[0m[0m  [38;5;105m0.200[0m  [38;5;12m[1m[38;5;105m0.174[0m[0m
	[38;5;1m[1mtest[0m   [38;5;1m[1m[38;5;218m0.632[0m[0m  [38;5;218m0.890[0m  [38;5;1m[1m[38;5;218m0.632[0m[0m
[38;5;178m[1mCV fold 1[0m
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
	[38;5;178m[1mscore   sacc    acc  tfacc[0m
	[38;5;10m[1mtrain[0m  [38;5;10m[1m[38;5;150m0.688[0m[0m  [38;5;150m0.991[

In [5]:
pwd

'/Users/panxiao/Desktop/data_camp/mbti_kaggle'