In [69]:
import boto3
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [3]:
# Create a boto3 client for Amazon S3; the listing and download cells below reuse it.
client_s3 = boto3.client("s3")

In [19]:
# List the objects stored in the source bucket.
# list_objects_v2 is the API AWS recommends over the legacy list_objects; the
# response still exposes the object summaries under "Contents".
# NOTE: a single call returns at most 1,000 keys; use a paginator if the bucket grows.
s3_objects = client_s3.list_objects_v2(Bucket="ingesting-data-super-heroes")

In [31]:
# Show the keys whose name contains "csv_file" (case-insensitive match).
# `.get("Contents", [])` keeps the cell working for an empty listing, where the
# response carries no "Contents" entry at all.
[
    obj["Key"]
    for obj in s3_objects.get("Contents", [])
    if "csv_file" in obj["Key"].lower()
]

['csv_file/', 'csv_file/df_info.csv']

In [32]:
# Download csv_file/df_info.csv from the bucket to the local file df_info.csv.
client_s3.download_file(
    Bucket="ingesting-data-super-heroes",
    Key="csv_file/df_info.csv",
    Filename="df_info.csv",
)

___

In [55]:
# Load the CSV downloaded from S3 into a DataFrame.
df = pd.read_csv("df_info.csv")

In [56]:
# Preview the first rows of the raw data.
df.head()

Unnamed: 0,name,gender,eye_color,race,hair_color,height,publisher,skin_color,alignment,weight
0,A-Bomb,Male,yellow,Human,No Hair,203.0,Marvel Comics,-,good,441.0
1,Abe Sapien,Male,blue,Icthyo Sapien,No Hair,191.0,Dark Horse Comics,blue,good,65.0
2,Abin Sur,Male,blue,Ungaran,No Hair,185.0,DC Comics,red,good,90.0
3,Abomination,Male,green,Human / Radiation,No Hair,203.0,Marvel Comics,-,bad,441.0
4,Abraxas,Male,blue,Cosmic Entity,Black,-99.0,Marvel Comics,-,bad,-99.0


In [57]:
# (rows, columns) of the raw data.
df.shape

(734, 10)

In [58]:
# Remove the skin_color and name columns.
# Rebinding `df` (rather than inplace=True) follows the pandas recommendation
# and makes the data lineage explicit.
df = df.drop(columns=["skin_color", "name"])

In [59]:
# Keep only rows whose weight and height are both non-negative.
# The preview shows -99.0 in both columns for some heroes, presumably a
# missing-value sentinel; verify against the data source.
df = df.loc[df["weight"].ge(0) & df["height"].ge(0)]

In [62]:
# Turn the multiclass classification problem into a binary one by keeping
# only two publisher labels.
df = df.loc[df["publisher"].isin(["Marvel Comics", "DC Comics"])]

In [68]:
# Preview the data after dropping columns and filtering rows.
df.head()

Unnamed: 0,gender,eye_color,race,hair_color,height,publisher,alignment,weight
0,Male,yellow,Human,No Hair,203.0,Marvel Comics,good,441.0
2,Male,blue,Ungaran,No Hair,185.0,DC Comics,good,90.0
3,Male,green,Human / Radiation,No Hair,203.0,Marvel Comics,bad,441.0
5,Male,blue,Human,No Hair,193.0,Marvel Comics,bad,122.0
7,Male,blue,Human,Blond,185.0,DC Comics,good,88.0


Label encoding.

In [111]:
# Label-encode every categorical feature; height and weight are numeric and are
# copied through unchanged. The target column (publisher) is appended last and
# kept as text.
#
# One encoder is fitted per column and stored in `label_encoders`, so each
# column's integer -> label mapping stays available (e.g. for
# inverse_transform). Refitting a single shared encoder would keep only the
# mapping of the last column.
label_encoders = {}
encoded_columns = {}
for col in df.drop("publisher", axis=1).columns:
    if col in ["height", "weight"]:
        encoded_columns[col] = df[col].values
        continue
    # `label_encoder` ends the loop bound to the encoder of the last
    # categorical column, as it did before.
    label_encoder = LabelEncoder()
    encoded_columns[col] = label_encoder.fit_transform(df[col])
    label_encoders[col] = label_encoder
encoded_columns["publisher"] = df["publisher"].values

# Built from plain arrays, so the result gets a fresh 0..n-1 index.
df_encoded = pd.DataFrame(encoded_columns)

# Number of columns that contain missing values (0 means none).
# NOTE(review): this runs on the encoded frame; depending on the scikit-learn
# version, LabelEncoder may encode NaN as its own class, which would hide
# missing categorical values here. Confirm on the raw `df` if in doubt.
df_encoded.isnull().any().sum()

0

Split the dataset into train, validation and test sets, stratifying on `publisher` so that every split keeps the same class balance.

In [114]:
# 90% train / 10% holdout, stratified on the label so class proportions are
# preserved. A fixed random_state makes the split reproducible across kernel
# restarts.
df_train, df_holdout = train_test_split(
    df_encoded,
    test_size=0.1,
    stratify=df_encoded["publisher"],
    random_state=42,
)

# Split the holdout data into 50% validation and 50% test, again stratified.
df_val, df_test = train_test_split(
    df_holdout,
    test_size=0.5,
    stratify=df_holdout["publisher"],
    random_state=42,
)

In [116]:
# Preview the validation split.
df_val.head()

Unnamed: 0,gender,eye_color,race,hair_color,height,alignment,weight,publisher
28,1,16,1,15,193.0,2,98.0,Marvel Comics
52,2,0,0,14,201.0,2,216.0,Marvel Comics
355,1,3,34,6,170.0,1,59.0,Marvel Comics
24,2,3,6,4,185.0,2,146.0,DC Comics
169,1,5,0,2,211.0,1,104.0,Marvel Comics


SageMaker Processing jobs let us parallelize the processing over many nodes in a cluster.

In [126]:
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker import get_execution_role

In [128]:
# Fetch the execution role through the SageMaker SDK; it is passed to the processor below.
role = get_execution_role()

In [131]:
# Configure the scikit-learn processor: framework version 0.20.0 on a single
# ml.m5.xlarge instance, running under the execution role fetched above.
processor = SKLearnProcessor(
    framework_version="0.20.0",
    role=role,
    instance_type="ml.m5.xlarge",
    instance_count=1,
)

In [None]:
from sagemaker.processing import ProcessingInput, ProcessingOutput

input_data = "s3://ingesting-data-super-heroes/csv_file/df_info.csv"

# Run preprocessing.py as a processing job. The raw CSV is made available
# under /opt/ml/processing/input, and the three output directories below are
# registered as named job outputs.
processor.run(
    code="preprocessing.py",
    inputs=[ProcessingInput(source=input_data, destination="/opt/ml/processing/input")],
    outputs=[
        ProcessingOutput(output_name="train_data", source="/opt/ml/processing/train"),
        ProcessingOutput(output_name="test_data", source="/opt/ml/processing/test"),
        ProcessingOutput(output_name="val_data", source="/opt/ml/processing/val"),
    ],
)

# Describe the most recent job to find where each output was stored.
preprocessing_job_description = processor.jobs[-1].describe()

output_config = preprocessing_job_description["ProcessingOutputConfig"]

# Map output name -> S3 URI. Looking the names up in a dict raises KeyError
# right here if an expected output is missing, instead of silently leaving a
# variable unbound (or stale from an earlier run of this cell).
output_uris = {
    output["OutputName"]: output["S3Output"]["S3Uri"]
    for output in output_config["Outputs"]
}
preprocessed_training_data = output_uris["train_data"]
preprocessed_test_data = output_uris["test_data"]
preprocessed_val_data = output_uris["val_data"]

Get preprocessed data.

In [135]:
# Read the three CSVs written by the processing job straight from S3.
# NOTE: this rebinds df_train / df_test / df_val, replacing the frames produced
# by the local train_test_split cell above.
# The file names are assumed to match what preprocessing.py writes; verify there.
df_train = pd.read_csv(f"{preprocessed_training_data}/df_train.csv")
df_test = pd.read_csv(f"{preprocessed_test_data}/df_test.csv")
df_val = pd.read_csv(f"{preprocessed_val_data}/df_val.csv")

___