# Data science in Microsoft Fabric


In [10]:
# Azure Blob Storage location of the open "diabetes" sample dataset
blob_account_name = "azureopendatastorage"
blob_container_name = "mlsamples"
blob_relative_path = "diabetes"
blob_sas_token = r""  # Blank because the container allows anonymous access

# Build the WASBS URL and register the (empty) SAS token for this container in the Spark config
wasbs_path = f"wasbs://{blob_container_name}@{blob_account_name}.blob.core.windows.net/{blob_relative_path}"
spark.conf.set(f"fs.azure.sas.{blob_container_name}.{blob_account_name}.blob.core.windows.net", blob_sas_token)
print("Remote blob path: " + wasbs_path)

# Read the Parquet data with Spark; note that this does not load any rows yet
df = spark.read.parquet(wasbs_path)


StatementMeta(, b900b596-4f7c-4228-b743-df7a01dad183, 13, Finished, Available)

Remote blob path: wasbs://mlsamples@azureopendatastorage.blob.core.windows.net/diabetes


In [11]:
# Preview the first five rows of the loaded DataFrame
display(df.limit(5))

StatementMeta(, b900b596-4f7c-4228-b743-df7a01dad183, 14, Finished, Available)

SynapseWidget(Synapse.DataFrame, 256a0120-0191-4ec0-bea0-3c90ed80e3db)

In [12]:
# Convert the Spark DataFrame to pandas for local analysis.
# The conversion is guarded so this cell can be re-run safely: after the first run `df`
# is already a pandas DataFrame, which has no `toPandas` method, so the call is skipped
# instead of raising AttributeError.
if hasattr(df, "toPandas"):
    df = df.toPandas()
df.head()

StatementMeta(, b900b596-4f7c-4228-b743-df7a01dad183, 15, Finished, Available)

Unnamed: 0,AGE,SEX,BMI,BP,S1,S2,S3,S4,S5,S6,Y
0,59,2,32.1,101.0,157,93.2,38.0,4.0,4.8598,87,151
1,48,1,21.6,87.0,183,103.2,70.0,3.0,3.8918,69,75
2,72,2,30.5,93.0,156,93.6,41.0,4.0,4.6728,85,141
3,24,1,25.3,84.0,198,131.4,40.0,5.0,4.8903,89,206
4,50,1,23.0,101.0,192,125.4,52.0,4.0,4.2905,80,135


In [13]:
# Code generated by Data Wrangler for pandas DataFrame

def clean_data(df, threshold=211.5):
    """Add a binary 'Risk' column flagging rows whose 'Y' value exceeds a threshold.

    Args:
        df: pandas DataFrame containing a numeric 'Y' column. The frame is
            modified in place; pass a copy to keep the original untouched.
        threshold: Cut-off applied to 'Y'. Rows with Y strictly greater than
            this value get Risk = 1, all others get 0. Defaults to 211.5, the
            value that was previously hard-coded in this function.

    Returns:
        The same DataFrame object, now carrying the integer 'Risk' column.
    """
    # Created column 'Risk' from formula
    df['Risk'] = (df['Y'] > threshold).astype(int)
    return df

# Run the cleaning step on a copy so that `df` itself does not gain the 'Risk' column
df_clean = clean_data(df.copy())
df_clean.head()

StatementMeta(, b900b596-4f7c-4228-b743-df7a01dad183, 16, Finished, Available)

Unnamed: 0,AGE,SEX,BMI,BP,S1,S2,S3,S4,S5,S6,Y,Risk
0,59,2,32.1,101.0,157,93.2,38.0,4.0,4.8598,87,151,0
1,48,1,21.6,87.0,183,103.2,70.0,3.0,3.8918,69,75,0
2,72,2,30.5,93.0,156,93.6,41.0,4.0,4.6728,85,141,0
3,24,1,25.3,84.0,198,131.4,40.0,5.0,4.8903,89,206,0
4,50,1,23.0,101.0,192,125.4,52.0,4.0,4.2905,80,135,0


In [14]:
# Summary statistics of the cleaned frame, including the derived 'Risk' column
df_clean.describe()

StatementMeta(, b900b596-4f7c-4228-b743-df7a01dad183, 17, Finished, Available)

Unnamed: 0,AGE,SEX,BMI,BP,S1,S2,S3,S4,S5,S6,Y,Risk
count,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0
mean,48.5181,1.468326,26.375792,94.647014,189.140271,115.43914,49.788462,4.070249,4.641411,91.260181,152.133484,0.251131
std,13.109028,0.499561,4.418122,13.831283,34.608052,30.413081,12.934202,1.29045,0.522391,11.496335,77.093005,0.434155
min,19.0,1.0,18.0,62.0,97.0,41.6,22.0,2.0,3.2581,58.0,25.0,0.0
25%,38.25,1.0,23.2,84.0,164.25,96.05,40.25,3.0,4.2767,83.25,87.0,0.0
50%,50.0,1.0,25.7,93.0,186.0,113.0,48.0,4.0,4.62005,91.0,140.5,0.0
75%,59.0,2.0,29.275,105.0,209.75,134.5,57.75,5.0,4.9972,98.0,211.5,0.75
max,79.0,2.0,42.2,133.0,301.0,242.4,99.0,9.09,6.107,124.0,346.0,1.0


In [15]:
from sklearn.model_selection import train_test_split
    
# Feature matrix from the ten input columns; the regression target is the 'Y' column
feature_columns = ['AGE','SEX','BMI','BP','S1','S2','S3','S4','S5','S6']
X = df_clean[feature_columns].values
y = df_clean['Y'].values

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=0
)

StatementMeta(, b900b596-4f7c-4228-b743-df7a01dad183, 18, Finished, Available)

In [16]:
import mlflow
# Select the MLflow experiment that the regression run will be recorded under
experiment_name = "diabetes-regression"
mlflow.set_experiment(experiment_name)

StatementMeta(, b900b596-4f7c-4228-b743-df7a01dad183, 19, Finished, Available)

<Experiment: artifact_location='', creation_time=1701704513347, experiment_id='c6c4f5f2-8821-44a9-8349-4c27a866fe13', last_update_time=None, lifecycle_stage='active', name='diabetes-regression', tags={}>

In [17]:
from sklearn.linear_model import LinearRegression
    
with mlflow.start_run():
    # Enable MLflow autologging before the model is fitted
    mlflow.autolog()

    # Fit a linear regression on the training split
    model = LinearRegression().fit(X_train, y_train)

StatementMeta(, b900b596-4f7c-4228-b743-df7a01dad183, 20, Finished, Available)

2023/12/04 15:53:30 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.


In [18]:
from sklearn.model_selection import train_test_split
    
# Same ten input columns as features; the classification target is the binary 'Risk' column
feature_columns = ['AGE','SEX','BMI','BP','S1','S2','S3','S4','S5','S6']
X = df_clean[feature_columns].values
y = df_clean['Risk'].values

# 70/30 train/test split with a fixed seed so the partition is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=0
)

StatementMeta(, b900b596-4f7c-4228-b743-df7a01dad183, 21, Finished, Available)

In [21]:
# Inspect the training labels produced by the split
y_train

StatementMeta(, b900b596-4f7c-4228-b743-df7a01dad183, 24, Finished, Available)

array([1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1,
       0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,

In [22]:
import mlflow
# Select the MLflow experiment that the classification run will be recorded under
experiment_name = "diabetes-classification"
mlflow.set_experiment(experiment_name)

StatementMeta(, b900b596-4f7c-4228-b743-df7a01dad183, 25, Finished, Available)

2023/12/04 16:01:37 INFO mlflow.tracking.fluent: Experiment with name 'diabetes-classification' does not exist. Creating a new experiment.


<Experiment: artifact_location='', creation_time=1701705699208, experiment_id='7b2cc115-c318-40b1-8f7c-718db00dfbb2', last_update_time=None, lifecycle_stage='active', name='diabetes-classification', tags={}>

In [23]:
from sklearn.linear_model import LogisticRegression
    
with mlflow.start_run():
    # Enable scikit-learn autologging before the model is fitted
    mlflow.sklearn.autolog()

    # C is passed as 1/0.1 (i.e. 10.0); build the classifier first, then fit it
    model = LogisticRegression(C=1/0.1, solver="liblinear")
    model.fit(X_train, y_train)

StatementMeta(, b900b596-4f7c-4228-b743-df7a01dad183, 26, Finished, Available)

