### Step 1
Read the data

In [8]:
import pandas as pd

class DataSet:
    """Plain mutable container for one data split.

    The constructor only stores its arguments; every field defaults to None so
    that a split can be created from just a file path and filled in later.

    Attributes (as used in this notebook):
        RawX: feature table with the 'target' column removed.
        X:    features converted to a list of per-row records.
        Y:    the 'target' column.
        File: path handed to ``pd.read_csv`` to load the split.
    """

    def __init__(self, RawX=None, X=None, Y=None, File=None):
        # Keep the arguments exactly as given; no validation or copying happens here.
        self.RawX, self.X, self.Y, self.File = RawX, X, Y, File

# One DataSet per CSV split; only the path is known until the loop below loads it.
trainSet, testSet = (
    DataSet(File="assets/adult_train.csv"),
    DataSet(File="assets/adult_test.csv"),
)
datasets = [trainSet, testSet]

for dset in datasets:
    csvdat = pd.read_csv(dset.File)
    # frac=1.0 keeps every row and only reorders them; the fixed seed makes
    # the shuffled order repeatable.
    data_shuffled = csvdat.sample(frac=1.0, random_state=0)

    # Separate the 'target' label column from the remaining feature columns.
    dset.Y = data_shuffled['target']
    dset.RawX = data_shuffled.drop(columns='target')

In [9]:
# Preview the training split: first five labels as text, then the first five feature rows.
print(trainSet.Y.head(n=5))
trainSet.RawX.head(n=5)

22278    <=50K
8950     <=50K
7838     <=50K
16505    <=50K
19140     >50K
Name: target, dtype: object


Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
22278,49,Local-gov,HS-grad,9,Married-civ-spouse,Transport-moving,Husband,White,Male,0,0,40,United-States
8950,49,Private,HS-grad,9,Divorced,Other-service,Not-in-family,Black,Female,0,0,40,United-States
7838,31,Private,Prof-school,15,Never-married,Prof-specialty,Not-in-family,White,Male,0,0,50,United-States
16505,53,Private,Some-college,10,Divorced,Adm-clerical,Not-in-family,White,Female,0,0,43,United-States
19140,30,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,40,United-States


### Step 2
Encode the features with `DictVectorizer`

In [10]:
from sklearn.feature_extraction import DictVectorizer
import json
 
# Fit the encoding on the training records only.
xTrainRecords = trainSet.RawX.to_dict(orient="records")
vectorizer = DictVectorizer(sparse=False)
xTrainEncoded = vectorizer.fit_transform(xTrainRecords)

# Encode the test records with the already-fitted vectorizer, then map the
# encoded rows back to dicts so the raw and round-tripped forms can be compared.
testDict = testSet.RawX.to_dict(orient="records")
testFit = vectorizer.transform(testDict)
testUnVectorized = vectorizer.inverse_transform(testFit)

# Show the first test record before encoding and after the round trip.
print("As Dict", json.dumps(testDict[0], indent=4))
print("Unvectorized ", json.dumps(testUnVectorized[0], indent=4))

As Dict {
    "age": 44,
    "workclass": "Private",
    "education": "Some-college",
    "education-num": 10,
    "marital-status": "Married-civ-spouse",
    "occupation": "Transport-moving",
    "relationship": "Husband",
    "race": "White",
    "sex": "Male",
    "capital-gain": 0,
    "capital-loss": 0,
    "hours-per-week": 48,
    "native-country": "United-States"
}
Unvectorized  {
    "age": 44.0,
    "education-num": 10.0,
    "education=Some-college": 1.0,
    "hours-per-week": 48.0,
    "marital-status=Married-civ-spouse": 1.0,
    "native-country=United-States": 1.0,
    "occupation=Transport-moving": 1.0,
    "race=White": 1.0,
    "relationship=Husband": 1.0,
    "sex=Male": 1.0,
    "workclass=Private": 1.0
}


In [11]:
# Store each split's feature frame as a list of per-row records in dset.X.
for dset in datasets:
    dset.X = dset.RawX.to_dict(orient="records")

### Step 3
Build the pipeline

In [12]:
import sklearn as sk
from sklearn import tree
from sklearn.pipeline import make_pipeline
  
# Chain the record encoding and the classifier so that a single fit/predict
# call runs both steps on the per-row records.
# make_pipeline and tree are used through their direct imports rather than via
# sk.pipeline / sk.tree, which only resolve because those submodules happen to
# have been imported already.
task1pipeline = make_pipeline(
    DictVectorizer(sparse=False),
    # Fixed seed (same value as the shuffle seed used when loading the data):
    # without it the fitted tree, and therefore the scores and predictions
    # shown below, can differ from one run to the next.
    tree.DecisionTreeClassifier(random_state=0),
)

# Last expression of the cell, so the fitted pipeline is also displayed.
task1pipeline.fit(trainSet.X, trainSet.Y)

In [13]:
#Test pipeline
from sklearn.model_selection import cross_val_score
from statistics import mean

# Cross-validate the pipeline on the training split, then show the average of the scores.
scores = cross_val_score(estimator=task1pipeline, X=trainSet.X, y=trainSet.Y)
mean(scores)

In [None]:
# Peek Results
from IPython.display import display

# Put the true label ("result") and the pipeline's prediction ("predict") next
# to the raw test features. assign() builds a new frame, so testSet.RawX itself
# gains no extra columns.
testRawXCopy = testSet.RawX.assign(
    result=testSet.Y,
    predict=task1pipeline.predict(testSet.X),
)

display(testRawXCopy)

Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,result,predict
15729,44,Private,Some-college,10,Married-civ-spouse,Transport-moving,Husband,White,Male,0,0,48,United-States,>50K,<=50K
7077,37,Private,HS-grad,9,Divorced,Exec-managerial,Unmarried,White,Female,0,0,50,United-States,<=50K,<=50K
14946,35,Private,Bachelors,13,Never-married,Exec-managerial,Not-in-family,White,Male,0,0,45,United-States,<=50K,<=50K
9416,38,State-gov,Bachelors,13,Never-married,Exec-managerial,Not-in-family,White,Female,0,0,45,United-States,<=50K,<=50K
5739,41,Private,HS-grad,9,Married-civ-spouse,Adm-clerical,Wife,White,Female,5013,0,30,United-States,<=50K,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13123,71,?,HS-grad,9,Widowed,?,Unmarried,White,Female,6612,0,42,United-States,>50K,<=50K
3264,30,State-gov,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,45,United-States,<=50K,>50K
9845,54,Self-emp-not-inc,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,<=50K,>50K
10799,44,Self-emp-inc,Some-college,10,Married-civ-spouse,Sales,Husband,White,Male,0,0,45,United-States,>50K,>50K


# Task 2: Decision trees and random forests
