In [1]:
from pandas import Series, DataFrame
import pandas as pd
from patsy import dmatrices
%pylab inline

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

import warnings
warnings.filterwarnings('ignore')

Populating the interactive namespace from numpy and matplotlib


In [2]:
df = pd.read_csv('aac_intakes_outcomes.csv')
df['cat_or_dog'] = False

def valid_animal(x):
    valid = ['Dog','Cat']
    if x in valid:
        return True
    return False

df['cat_or_dog'] = df['animal_type'].apply(valid_animal)



mask = (df['cat_or_dog'] == True)
df = df[mask]

colors = df['color'].value_counts()>100
color_list = df['color'].value_counts()[colors].index.tolist()
df['color_bin'] = 'other'
def colortest(x):
    if x in color_list:
        return x
    return 'other'
df['color_bin'] = df['color'].apply(colortest)


In [3]:
categorical_columns_to_convert = ['intake_type', 'sex_upon_intake', 'intake_weekday', 'breed', 'color_bin', 'intake_condition', 'sex_upon_outcome', 'outcome_weekday']

new_categorical_columns = []

for column in categorical_columns_to_convert:
    df[column] = df[column].astype('category')
    new_column_name = column + "_coded"
    new_categorical_columns.append(new_column_name)
    df[new_column_name] = df[column].cat.codes

categorical_columns = ['intake_month', 'intake_hour', 'outcome_month', 'outcome_hour', 'dob_month']

all_categorical_columns = new_categorical_columns + categorical_columns                              
                                
numerical_columns = ['dob_year', 'age_upon_intake_(days)', 'age_upon_outcome_(days)', 'outcome_year', 'intake_year', 'time_in_shelter_days']

In [4]:
df = df[df['outcome_type'] != 'Return to Owner']

df['target'] = 0
mask = (df['outcome_type']=='Adoption')
df['target'][mask] = 1.0

def formula(numerical, categorical):
    formula = 'target ~ 0'
    for numerical_feature in numerical:
        formula += ' + Q("' + numerical_feature + '")'
    for categorical_feature in categorical:
        formula += ' + C(' + categorical_feature + ')'
    return formula
    
formula = formula(numerical_columns, all_categorical_columns)
formula

'target ~ 0 + Q("dob_year") + Q("age_upon_intake_(days)") + Q("age_upon_outcome_(days)") + Q("outcome_year") + Q("intake_year") + Q("time_in_shelter_days") + C(intake_type_coded) + C(sex_upon_intake_coded) + C(intake_weekday_coded) + C(breed_coded) + C(color_bin_coded) + C(intake_condition_coded) + C(sex_upon_outcome_coded) + C(outcome_weekday_coded) + C(intake_month) + C(intake_hour) + C(outcome_month) + C(outcome_hour) + C(dob_month)'

In [5]:

Y, X = dmatrices(formula, df, return_type='dataframe')
y = Y['target'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)
model = LogisticRegression()
result = model.fit(X_train, y_train)
prediction_train = model.predict(X_train)
prediction_test = model.predict(X_test)

print 'Training accuracy = ', metrics.accuracy_score(y_train, prediction_train)
print 'Test accuracy = ', metrics.accuracy_score(y_test, prediction_test)

negative_examples_in_test = len(y_test[y_test==0])
total_examples_in_test = len(y_test)
print 'Baseline accuracy =', negative_examples_in_test * 1.0 / total_examples_in_test

weights = Series(model.coef_[0],
                 index=X.columns.values)
weights.sort_values()

Training accuracy =  0.842823898176
Test accuracy =  0.84165327719
Baseline accuracy = 0.44739320738


C(sex_upon_outcome_coded)[T.4]   -1.355948
C(sex_upon_intake_coded)[T.4]    -1.355948
C(sex_upon_outcome_coded)[T.1]   -1.252557
C(outcome_hour)[T.9]             -0.854852
C(outcome_month)[T.4]            -0.591048
C(sex_upon_intake_coded)[T.3]    -0.534178
C(intake_type_coded)[2]          -0.532651
C(breed_coded)[T.1768]           -0.531950
C(intake_type_coded)[0]          -0.488031
C(intake_condition_coded)[T.7]   -0.478819
C(breed_coded)[T.1521]           -0.439102
C(breed_coded)[T.910]            -0.406648
C(outcome_month)[T.5]            -0.327027
C(breed_coded)[T.37]             -0.320214
C(intake_hour)[T.20]             -0.319859
C(breed_coded)[T.905]            -0.283507
C(breed_coded)[T.74]             -0.281871
C(intake_condition_coded)[T.2]   -0.278893
C(breed_coded)[T.63]             -0.272684
C(outcome_month)[T.3]            -0.270434
C(breed_coded)[T.1260]           -0.256703
C(color_bin_coded)[T.61]         -0.254315
C(breed_coded)[T.1290]           -0.243434
C(breed_cod

In [6]:
df_dogs = df[df['animal_type'] == 'Dog']

Y_dog, X_dog = dmatrices(formula, df_dogs, return_type='dataframe')
y_dog = Y_dog['target'].values
X_train_dog, X_test_dog, y_train_dog, y_test_dog = train_test_split(X_dog, y_dog, test_size=0.3, random_state=1)
model_dog = LogisticRegression()
result = model_dog.fit(X_train_dog, y_train_dog)
prediction_train_dog = model_dog.predict(X_train_dog)
prediction_test_dog = model_dog.predict(X_test_dog)

print 'Training accuracy = ', metrics.accuracy_score(y_train_dog, prediction_train_dog)
print 'Test accuracy = ', metrics.accuracy_score(y_test_dog, prediction_test_dog)

negative_examples_in_test_dog = len(y_test_dog[y_test_dog==0])
total_examples_in_test_dog = len(y_test_dog)
print 'Baseline accuracy =', negative_examples_in_test_dog * 1.0 / total_examples_in_test_dog

weights_dog = Series(model_dog.coef_[0],
                 index=X_dog.columns.values)
weights_dog.sort_values()

Training accuracy =  0.819719563766
Test accuracy =  0.816265060241
Baseline accuracy = 0.360303282094


C(intake_type_coded)[0]          -1.008862
C(breed_coded)[T.1768]           -0.948568
C(sex_upon_outcome_coded)[T.4]   -0.849946
C(sex_upon_intake_coded)[T.4]    -0.849946
C(sex_upon_outcome_coded)[T.1]   -0.718730
C(intake_condition_coded)[T.7]   -0.603117
C(breed_coded)[T.1767]           -0.521230
C(breed_coded)[T.427]            -0.494711
C(breed_coded)[T.1290]           -0.493054
C(breed_coded)[T.881]            -0.455489
C(breed_coded)[T.74]             -0.432918
C(breed_coded)[T.1521]           -0.432260
C(breed_coded)[T.1627]           -0.422775
C(breed_coded)[T.1260]           -0.420624
C(color_bin_coded)[T.28]         -0.398670
C(breed_coded)[T.1569]           -0.395379
C(breed_coded)[T.63]             -0.389915
C(breed_coded)[T.1069]           -0.383481
C(color_bin_coded)[T.35]         -0.363200
C(intake_condition_coded)[T.2]   -0.362646
C(breed_coded)[T.742]            -0.354721
C(outcome_hour)[T.10]            -0.337153
C(intake_weekday_coded)[T.5]     -0.333971
C(breed_cod

In [7]:
df_cats = df[df['animal_type'] == 'Cat']

Y_cat, X_cat = dmatrices(formula, df_cats, return_type='dataframe')
y_cat = Y_cat['target'].values
X_train_cat, X_test_cat, y_train_cat, y_test_cat = train_test_split(X, y, test_size=0.3, random_state=1)
model_cat = LogisticRegression()
result = model_cat.fit(X_train_cat, y_train_cat)
prediction_train_cat = model_cat.predict(X_train_cat)
prediction_test_cat = model_cat.predict(X_test_cat)

print 'Training accuracy = ', metrics.accuracy_score(y_train_cat, prediction_train_cat)
print 'Test accuracy = ', metrics.accuracy_score(y_test_cat, prediction_test_cat)

negative_examples_in_test_cat = len(y_test_cat[y_test_cat==0])
total_examples_in_test_cat = len(y_test_cat)
print 'Baseline accuracy =', negative_examples_in_test_cat * 1.0 / total_examples_in_test_cat

weights_cat = Series(model_cat.coef_[0],
                 index=X_cat.columns.values)
weights_cat.sort_values()

Training accuracy =  0.842823898176
Test accuracy =  0.84165327719
Baseline accuracy = 0.44739320738


ValueError: Wrong number of items passed 1934, placement implies 245

In [None]:
df['sex_upon_outcome']