In [1]:
%pylab inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

Populating the interactive namespace from numpy and matplotlib


In [2]:

def read_test_train():

    print("Read people.csv...")
    people = pd.read_csv(r'C:\Users\piush\Desktop\Dataset\RedHat\people.csv',
                       dtype={'people_id': np.str,
                              'activity_id': np.str,
                              'char_38': np.int32},
                       parse_dates=['date'])

    print("Load train.csv...")
    train = pd.read_csv(r'C:\Users\piush\Desktop\Dataset\RedHat\act_train.csv',
                        dtype={'people_id': np.str,
                               'activity_id': np.str,
                               'outcome': np.int8},
                        parse_dates=['date'])

    print("Load test.csv...")
    test = pd.read_csv(r'C:\Users\piush\Desktop\Dataset\RedHat\act_test.csv',
                       dtype={'people_id': np.str,
                              'activity_id': np.str},
                       parse_dates=['date'])

    print("Process tables...")
    for table in [train, test]:
        table['year'] = table['date'].dt.year
        table['month'] = table['date'].dt.month
        table['day'] = table['date'].dt.day
        table.drop('date', axis=1, inplace=True)
        table['activity_category'] = table['activity_category'].str.lstrip('type ').astype(np.int32)
        for i in range(1, 11):
            table['char_' + str(i)].fillna('type 0', inplace=True)
            table['char_' + str(i)] = table['char_' + str(i)].str.lstrip('type ').astype(np.int32)

    people['year'] = people['date'].dt.year
    people['month'] = people['date'].dt.month
    people['day'] = people['date'].dt.day
    people.drop('date', axis=1, inplace=True)
    people['group_1'] = people['group_1'].str.lstrip('group ').astype(np.int32)
    for i in range(1, 10):
        people['char_' + str(i)] = people['char_' + str(i)].str.lstrip('type ').astype(np.int32)
    for i in range(10, 38):
        people['char_' + str(i)] = people['char_' + str(i)].astype(np.int32)

    print("Merge...")
    train = pd.merge(train, people, how='left', on='people_id', left_index=True)
    train.fillna(0.0, inplace=True)
    test = pd.merge(test, people, how='left', on='people_id', left_index=True)
    test.fillna(0.0, inplace=True)

    
    return train, test

In [3]:
train, test = read_test_train()
print('Length of train: ', len(train))
print('Length of test: ', len(test))
#print('Features [{}]: {}'.format(len(features), sorted(features)))


Read people.csv...
Load train.csv...
Load test.csv...
Process tables...
Merge...
Length of train:  2197291
Length of test:  498687


In [4]:
train.head(20)

Unnamed: 0,people_id,activity_id,activity_category,char_1_x,char_2_x,char_3_x,char_4_x,char_5_x,char_6_x,char_7_x,...,char_32,char_33,char_34,char_35,char_36,char_37,char_38,year_y,month_y,day_y
0,ppl_100,act2_1734928,4,0,0,0,0,0,0,0,...,0,0,1,1,1,0,36,2021,6,29
0,ppl_100,act2_2434093,2,0,0,0,0,0,0,0,...,0,0,1,1,1,0,36,2021,6,29
0,ppl_100,act2_3404049,2,0,0,0,0,0,0,0,...,0,0,1,1,1,0,36,2021,6,29
0,ppl_100,act2_3651215,2,0,0,0,0,0,0,0,...,0,0,1,1,1,0,36,2021,6,29
0,ppl_100,act2_4109017,2,0,0,0,0,0,0,0,...,0,0,1,1,1,0,36,2021,6,29
0,ppl_100,act2_898576,4,0,0,0,0,0,0,0,...,0,0,1,1,1,0,36,2021,6,29
1,ppl_100002,act2_1233489,2,0,0,0,0,0,0,0,...,1,1,1,1,1,0,76,2021,1,6
1,ppl_100002,act2_1623405,2,0,0,0,0,0,0,0,...,1,1,1,1,1,0,76,2021,1,6
2,ppl_100003,act2_1111598,2,0,0,0,0,0,0,0,...,1,1,1,0,1,1,99,2022,6,10
2,ppl_100003,act2_1177453,2,0,0,0,0,0,0,0,...,1,1,1,0,1,1,99,2022,6,10


In [5]:
train = train.drop(['people_id'],axis =1)

In [6]:
train['activity_id'] = train['activity_id'].factorize()[0]

In [7]:
train.head(5)

Unnamed: 0,activity_id,activity_category,char_1_x,char_2_x,char_3_x,char_4_x,char_5_x,char_6_x,char_7_x,char_8_x,...,char_32,char_33,char_34,char_35,char_36,char_37,char_38,year_y,month_y,day_y
0,0,4,0,0,0,0,0,0,0,0,...,0,0,1,1,1,0,36,2021,6,29
0,1,2,0,0,0,0,0,0,0,0,...,0,0,1,1,1,0,36,2021,6,29
0,2,2,0,0,0,0,0,0,0,0,...,0,0,1,1,1,0,36,2021,6,29
0,3,2,0,0,0,0,0,0,0,0,...,0,0,1,1,1,0,36,2021,6,29
0,4,2,0,0,0,0,0,0,0,0,...,0,0,1,1,1,0,36,2021,6,29


In [8]:
from sklearn.cross_validation import train_test_split
train_target = train['outcome']
train_data = train.drop(['outcome'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(train_data, train_target, test_size=0.25, random_state=33)

In [9]:
from sklearn import tree
dt = tree.DecisionTreeClassifier(criterion='entropy')
dt = dt.fit(X_train, y_train)

In [10]:
from sklearn import metrics
def measure_performance(X, y, clf, show_accuracy=True, show_classification_report=True, show_confussion_matrix=True):
    y_pred = clf.predict(X)   
    if show_accuracy:
         print ("Accuracy:{0:.3f}".format(metrics.accuracy_score(y, y_pred)),"\n")
    if show_classification_report:
        print ("Classification report")
        print (metrics.classification_report(y, y_pred),"\n")
      
    if show_confussion_matrix:
        print ("Confussion matrix")
        print (metrics.confusion_matrix(y, y_pred),"\n")

In [11]:
measure_performance(X_test, y_test, dt, show_confussion_matrix=False, show_classification_report=False)

Accuracy:0.991 



In [12]:
from sklearn import feature_selection
fs = feature_selection.SelectPercentile(feature_selection.chi2, percentile=20)
X_train_fs = fs.fit_transform(X_train, y_train)
print (train_data.columns[fs.get_support()])

Index(['activity_id', 'char_10_x', 'group_1', 'char_3_y', 'char_4_y',
       'char_7_y', 'char_13', 'char_22', 'char_34', 'char_36', 'char_37',
       'char_38'],
      dtype='object')


In [13]:
print (fs.scores_[2])
#print (titanic_data.columns[2])


4651.00644916
