# DataOps with ClearML Pt.I 
## (ClearSHOW S02E05)
Simple feature engineering with ClearML as a data store. 


In [5]:


# normal imports
from typing import Optional
import pandas as pd

## First things first, init a task on the project.
#### That's our famous original 2-LOC integration, now with subprojects :)

In [7]:
# ClearML entry points: Task for experiment tracking, Dataset for the data store.
from clearml import Task, Dataset

# Create the ClearML task for this notebook run under the FeatureStore subproject.
# Later cells use `task` to upload an artifact, report a table, and close the run.
task = Task.init(project_name="titanic_demo/FeatureStore", task_name="feature_set_2")


Can't get url information for git repo in /Users/thisadee_pre/play/clearml-lab/.venv/lib/python3.11/site-packages


ClearML Task: created new task id=68a52540085345c7b7f50ab47781e9b7
ClearML results page: https://app.clear.ml/projects/668216dab55b4d5f9ebbc4c220fb886f/experiments/68a52540085345c7b7f50ab47781e9b7/output/log


ClearML Monitor: GPU monitoring failed getting GPU reading, switching off GPU monitoring


## Now let's get the dataset from our datastore
Refer to S02E04 to see how we downloaded it from Kaggle and put it there. AKA clearml-data rul3z!!1

In [9]:
# Look up the dataset named "titanic" in the titanic_demo/dataset project.
tdata = Dataset.get(dataset_project="titanic_demo/dataset",dataset_name="titanic")
# Get a local copy of its files; `tdata_folder` is the folder train.csv is read from below.
tdata_folder = tdata.get_local_copy()


### The dataset has been downloaded to the local machine and can now be loaded.
#### Don't forget to use pandas to read the csv ;) 

# Real stuff starts here 
Now we want to do some cleaning and feature engineering, [as suggested by](https://www.kaggle.com/ldfreeman3/a-data-science-framework-to-achieve-99-accuracy) the top [Kaggle kernels](https://www.kaggle.com/startupsci/titanic-data-science-solutions) for this data.

## The following defs are meant to look like you copy-pasted them from someone else :) 


In [10]:
def extract_title(in_name_series: pd.Series) -> pd.Series:
    """Pull the title out of each passenger name.

    The title is the run of ASCII letters that follows a space and is
    immediately followed by a period, e.g. ``"Braund, Mr. Owen Harris"``
    yields ``"Mr"``.

    Args:
        in_name_series: Series of name strings.

    Returns:
        Series of extracted titles aligned with the input index; names with
        no match yield NaN.
    """
    # Raw string: the backslash must reach the regex engine as-is, and a plain
    # string with "\." is an invalid escape sequence that newer Pythons warn about.
    return in_name_series.str.extract(r' ([A-Za-z]+)\.', expand=False)

def make_fixed_title_series(
    in_title_series: pd.Series, override_rare_list: Optional[list] = None
) -> pd.Series:
    """Normalise raw titles into a small, fixed vocabulary.

    Every title found in the rare list becomes ``"Rare"``; afterwards
    ``"Mlle"`` and ``"Ms"`` become ``"Miss"`` and ``"Mme"`` becomes ``"Mrs"``.
    All other values pass through unchanged.

    Args:
        in_title_series: Series of raw titles.
        override_rare_list: Titles to treat as rare. When ``None`` the
            built-in list below is used.

    Returns:
        A new Series with the replacements applied.
    """
    if override_rare_list is not None:
        rare_titles = override_rare_list
    else:
        rare_titles = [
            "Lady", "Countess", "Capt", "Col", "Don", "Dr",
            "Major", "Rev", "Sir", "Jonkheer", "Dona",
        ]

    # The three replacements are applied one after another, in this order.
    return (
        in_title_series
        .replace(rare_titles, "Rare")
        .replace(["Mlle", "Ms"], "Miss")
        .replace("Mme", "Mrs")
    )

def map_title(title_series: pd.Series, mapping: dict) -> pd.Series:
    """Translate titles into their keys from ``mapping``.

    ``mapping`` goes key -> title, so it is inverted before the lookup.

    Args:
        title_series: Series of titles.
        mapping: Dict whose values are the titles and whose keys are the
            codes to emit.

    Returns:
        Series of codes; titles absent from ``mapping`` (not expected) are
        filled with 0.
    """
    title_to_key = dict(zip(mapping.values(), mapping.keys()))
    return title_series.map(title_to_key).fillna(0)

ClearML Monitor: Could not detect iteration reporting, falling back to iterations as seconds-from-start


# Create a categorical column for the passenger titles and document your work

In [11]:
# Load train.csv from the local copy of the dataset.
train_df = pd.read_csv(tdata_folder+'/train.csv')
# Preview the first rows.
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [12]:
# Numeric category code -> passenger title; map_title inverts this to encode titles.
num_to_title = {1: "Mr", 2: "Miss", 3: "Mrs", 4: "Master", 5: "Rare"}
# When True, create_categorical_title casts the codes to the pandas "category" dtype.
Use_actual_cat = False

def create_categorical_title(
    name_series: pd.Series, num_to_title_mapping: dict
) -> pd.Series:
    """Turn passenger names into numeric title codes.

    Runs the names through ``extract_title``, ``make_fixed_title_series``
    (with its default rare list) and ``map_title``.

    Args:
        name_series: Series of passenger names.
        num_to_title_mapping: Dict of code -> title handed to ``map_title``.

    Returns:
        Series of title codes, cast to ``"category"`` dtype when the
        module-level ``Use_actual_cat`` flag is truthy.
    """
    cleaned_titles = make_fixed_title_series(extract_title(name_series))
    title_codes = map_title(cleaned_titles, num_to_title_mapping)
    # Use_actual_cat is read from module scope at call time.
    return title_codes.astype("category") if Use_actual_cat else title_codes

# Add the encoded title as a new "Title" column derived from the "Name" column.
train_df["Title"] = create_categorical_title(train_df["Name"], num_to_title)




In [13]:
# Preview again to check the newly added Title column.
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,3
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,2
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,3
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,1


## Make sure you log some of your process in the task itself:

In [14]:
# Publish the code -> title mapping as a task artifact so it can be viewed and fetched later.
task.upload_artifact("key_category_to_title", num_to_title)

# Sanity check: mean of Survived per Title code, indexed by Title,
# reported as a table on the task.
sanity_check = (
    train_df[["Title", "Survived"]]
    .groupby(["Title"], as_index=False)
    .mean()
    .set_index('Title', drop=True)
)
task.logger.report_table('survival', 'categorical', table_plot=sanity_check)

# Now let's create a binary diff over the original dataset which contains the new feature
# step 1 - create the feature set


In [22]:
# Create a dataset whose parent is the original titanic dataset and attach it
# to the current task (use_current_task=True) instead of a separate dataset task.
# The project name is the same one that was passed to Task.init.
with_feature = Dataset.create('name does not matter - the task is the feature',
                              dataset_project='titanic_demo/FeatureStore',
                              parent_datasets=[tdata.id],
                              use_current_task=True)   # This boolean is the main point actually!!!

ClearML results page: https://app.clear.ml/projects/3939e260f6c24848932092cc225b2726/experiments/68a52540085345c7b7f50ab47781e9b7/output/log
ClearML dataset page: https://app.clear.ml/datasets/simple/3939e260f6c24848932092cc225b2726/experiments/68a52540085345c7b7f50ab47781e9b7


In [23]:
from tempfile import mkdtemp
# Create a fresh temporary folder to stage the updated file in.
new_folder = mkdtemp()

# Write the current train_df as train.csv into the staging folder, without the index column.
train_df.to_csv(new_folder+'/train.csv', index=False)
# Disabled alternative, kept for reference:
# with_feature.add_external_files(new_folder+'/train.csv')
# Sync the dataset with the staging folder, upload the changes, then finalize it.
with_feature.sync_folder(new_folder)
with_feature.upload()
with_feature.finalize()


Generating SHA2 hash for 1 files
Hash generation completed
Uploading dataset changes (1 files compressed to 22.14 KiB) to https://files.clear.ml
File compression and upload completed: total size 22.14 KiB, 1 chunk(s) stored (average size 22.14 KiB)


True

In [24]:
# NOTE(review): finalize() is already called at the end of the previous cell;
# this repeat looks redundant — confirm and drop.
with_feature.finalize()


True

Generating SHA2 hash for 1 files
Hash generation completed
File compression and upload completed: total size 0 B, 0 chunk(s) stored (average size 0 B)


True

In [19]:
# NOTE(review): the saved output of this cell is
#   ValueError: Cannot get a local copy of a dataset that was not finalized/closed
# and its execution count (19) is lower than that of the cells above it (22-24),
# so it appears to be an earlier attempt at the sync/upload/finalize step.
# Confirm and remove so the notebook survives Restart & Run All.
from tempfile import mkdtemp
# Request a mutable local copy of the dataset inside a fresh temporary folder.
new_folder = with_feature.get_mutable_local_copy(mkdtemp())
print(f'new_folder is:{new_folder}')
# Overwrite train.csv with the current train_df (the one with the added column).
train_df.to_csv(new_folder+'/train.csv', index=False)
with_feature.sync_folder(new_folder)
with_feature.upload()
with_feature.finalize()



ValueError: Cannot get a local copy of a dataset that was not finalized/closed

In [26]:
# Close the ClearML task that was created with Task.init near the top of the notebook.
task.close()


# Wait, is that it?!
