# 03-dagshub-data-versioning

## Initial data split into train, val and test sets with a 70/10/20 ratio, stratified by generator

In [19]:
import pandas as pd

# Load the raw synthetic informal/formal sentence pairs and preview the first rows.
raw_csv_path = "../data/raw/informal_formal_synthetic_raw.csv"
raw_df = pd.read_csv(raw_csv_path)
raw_df.head()

Unnamed: 0,zdanie_nieformalne,zdanie_formalne,model,generator
0,"Siema, co tam słychać?","Dzień dobry, jak się Pan/Pani miewa?",gemini-2.5,seba
1,Trzeba to ogarnąć na jutro.,Należy to przygotować/zrealizować do jutra.,gemini-2.5,seba
2,"Daj znać, jak coś będziesz wiedział.","Proszę o informację, gdy tylko uzyska Pan/Pani...",gemini-2.5,seba
3,Ta fura jest mega wypasiona.,Ten samochód jest bardzo dobrze wyposażony.,gemini-2.5,seba
4,Nie kumam tej nowej apki.,Nie rozumiem działania tej nowej aplikacji.,gemini-2.5,seba


In [36]:
from sklearn.model_selection import train_test_split

# Two-stage stratified split giving 70% train / 10% val / 20% test overall.
# Stage 1 holds out 30% of the rows. Stage 2 sends one third of that hold-out
# (10% overall) to validation and keeps the other two thirds (20% overall) for
# test. Both stages stratify on "generator" and reuse the same seed.
holdout_fraction = 0.3
val_fraction_of_holdout = 1 / 3
split_seed = 42

train_df, test_val_df = train_test_split(
    raw_df,
    test_size=holdout_fraction,
    random_state=split_seed,
    stratify=raw_df["generator"],
)
# train_test_split returns (larger remainder, `test_size` part), so the first
# name receives the test rows and the second the validation rows.
test_df, val_df = train_test_split(
    test_val_df,
    test_size=val_fraction_of_holdout,
    random_state=split_seed,
    stratify=test_val_df["generator"],
)

# Sanity check: share of the raw rows in each split, in (train, val, test) order.
tuple(len(part) / len(raw_df) for part in (train_df, val_df, test_df))

(0.7, 0.1, 0.2)

In [37]:
# Tag every row with the split it belongs to.
# `.assign` builds a new frame instead of writing a column into the objects
# returned by `train_test_split`, which avoids pandas' SettingWithCopyWarning
# and keeps the cell safe to re-run. The names are rebound so the three frames
# still carry the "split" column afterwards.
train_df = train_df.assign(split="train")
val_df = val_df.assign(split="val")
test_df = test_df.assign(split="test")

# Stack the splits into a single table with a fresh 0..n-1 index.
data = pd.concat([train_df, val_df, test_df]).reset_index(drop=True)

# Every raw row must end up in exactly one split.
assert len(data) == len(raw_df), "split sizes do not add up to the raw row count"

# Persist the combined table as v1 of the processed dataset.
data.to_csv("../data/processed/informal_formal_synthetic_v1.csv", index=False)

## Upload dataset to DagsHub



In [88]:
# Upload the local processed-data folder with the DagsHub client. The files are
# stored in the repository under the remote folder "synthetic_data"
# (NOTE(review): expected to be DVC-tracked on the DagsHub side — confirm).
# Follow the instructions that appear to authorize the request.
from dagshub import upload_files

dagshub_repo = "informal2formal/mlflow"
processed_dir = "../data/processed"
remote_dir = "synthetic_data"

upload_files(repo=dagshub_repo, local_path=processed_dir, remote_path=remote_dir)

## Create a Data Source from the uploaded dataset



In [89]:
from dagshub.data_engine import datasources

# Create a Data Engine datasource in the informal2formal/mlflow repository.
# The arguments are passed positionally as repository, datasource name and
# the "synthetic_data" folder that was uploaded above.
# NOTE(review): this is presumably a one-off step — re-running it against an
# already existing datasource may fail; confirm before "Run All".
datasource_repo = "informal2formal/mlflow"
datasource_name = "synthetic_data_source"
datasource_path = "synthetic_data"

ds = datasources.create_datasource(datasource_repo, datasource_name, datasource_path)

In [90]:
# Shortly after the datasource is created, the detected files should be listed here.
head_result = ds.head()
head_result.dataframe

Unnamed: 0,path,datapoint_id,dagshub_download_url,media type,size
0,informal_formal_synthetic_v1.csv,86087877,https://dagshub.com/api/v1/repos/informal2form...,text/plain,1221781


For more information on how to use this new datasource, follow the instructions on:  
https://dagshub.com/informal2formal/mlflow/datasets

## Load the Data Source from DagsHub

In [91]:
from dagshub.data_engine import datasources

# Fetch the already created datasource by repository and name, then show the
# first datapoints it lists.
source_repo, source_name = "informal2formal/mlflow", "synthetic_data_source"
ds = datasources.get(source_repo, source_name)
listed_datapoints = ds.head()
listed_datapoints.dataframe

Unnamed: 0,path,datapoint_id,dagshub_download_url,media type,size
0,informal_formal_synthetic_v1.csv,86087877,https://dagshub.com/api/v1/repos/informal2form...,text/plain,1221781


In [93]:
# Read the v1 dataset back from DagsHub.
# The download URL is selected by file name rather than by taking the first
# listed datapoint, so the right file is still read once more files are
# uploaded to the same remote folder.
dataset_file = "informal_formal_synthetic_v1.csv"
datapoints = ds.head().dataframe
matching_urls = datapoints.loc[
    datapoints["path"] == dataset_file, "dagshub_download_url"
]
if matching_urls.empty:
    raise FileNotFoundError(
        f"{dataset_file!r} was not found among the datasource's listed datapoints"
    )
df = pd.read_csv(matching_urls.iloc[0])

# Number of rows per split in the downloaded dataset.
df["split"].value_counts()

split
train    2450
test      700
val       350
Name: count, dtype: int64