In [22]:
# Install into the running kernel's environment (%pip) rather than the system
# interpreter (sudo pip), and pin the version so reruns resolve the same package.
# 1.0.8 is the version the recorded run of this cell downloaded; bump it deliberately.
%pip install snowflake-ml-python==1.0.8

Collecting snowflake-ml-python
[?25l  Downloading https://files.pythonhosted.org/packages/05/1f/738b02586c868aed616def737786b5849fe7b7475cfc6e3ee2f1c6badacc/snowflake_ml_python-1.0.8-py3-none-any.whl (1.7MB)
[K     |████████████████████████████████| 1.7MB 7.6MB/s eta 0:00:01
[?25hCollecting s3fs<2024,>=2022.11
  Downloading https://files.pythonhosted.org/packages/51/dc/ef0a84b2d7d03e042bdced0a5bab9cfee1e11a0080f010e43b37222784ce/s3fs-2023.9.1-py3-none-any.whl
Collecting pyyaml<7,>=6.0
[?25l  Downloading https://files.pythonhosted.org/packages/c8/6b/6600ac24725c7388255b2f5add93f91e58a5d7efaf4af244fdbcc11a541b/PyYAML-6.0.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (736kB)
[K     |████████████████████████████████| 737kB 65.0MB/s eta 0:00:01
[?25hCollecting fsspec[http]<2024,>=2022.11
[?25l  Downloading https://files.pythonhosted.org/packages/6a/af/c673e8c663e17bd4fb201a6f029153ad5d7023aa4442d81c7987743db379/fsspec-2023.9.1-py3-none-any.whl (173kB)
[K     |██████████

Collecting pyarrow<10.1.0,>=10.0.1; extra == "pandas"
[?25l  Downloading https://files.pythonhosted.org/packages/89/b4/04ae9d39130d0dc40803eb6fbe84873c247f9c8e8111ac9b2cb30c35b515/pyarrow-10.0.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (36.0MB)
[K     |████████████████████████████████| 36.0MB 84.1MB/s eta 0:00:01
Collecting joblib>=1.1.1
[?25l  Downloading https://files.pythonhosted.org/packages/10/40/d551139c85db202f1f384ba8bcf96aca2f329440a844f924c8a0040b6d02/joblib-1.3.2-py3-none-any.whl (302kB)
[K     |████████████████████████████████| 307kB 109.7MB/s eta 0:00:01
[?25hCollecting threadpoolctl>=2.0.0
  Downloading https://files.pythonhosted.org/packages/81/12/fd4dea011af9d69e1cad05c75f3f7202cdcbeac9b712eea58ca779a72865/threadpoolctl-3.2.0-py3-none-any.whl
Collecting multidict<7.0,>=4.5
[?25l  Downloading https://files.pythonhosted.org/packages/fe/0c/8469202f8f4b0e65816f91c3febc4bda7316c995b59ecdf3b15c574f7a24/multidict-6.0.4-cp38-cp38-manylinux_2_17_x86_64.many

In [1]:
from snowflake.ml.modeling.pipeline import Pipeline
from snowflake.ml.modeling.xgboost import XGBClassifier
from snowflake.ml.modeling.linear_model import SGDClassifier
from snowflake.ml.modeling.preprocessing import MinMaxScaler, OrdinalEncoder, OneHotEncoder
from sklearn.metrics import mean_absolute_percentage_error

In [2]:
from snowflake.snowpark import Session
import configparser
def get_session(config_path='/notebooks/notebooks/config.ini'):
    """Create a Snowpark session from an INI credentials file.

    Args:
        config_path: Path of the INI file to read. It must contain a
            ``[Credentials]`` section with the keys ``user``, ``password``,
            ``account``, ``warehouse``, ``database``, ``schema`` and ``role``.

    Returns:
        The object returned by ``Session.builder.configs(...).create()``.

    Raises:
        FileNotFoundError: If ``config_path`` could not be read.
        KeyError: If the ``[Credentials]`` section or one of its keys is missing.
    """
    parser = configparser.ConfigParser()
    # ConfigParser.read() silently skips files it cannot open and returns the
    # list of files it actually parsed, so an empty result means the
    # credentials file is missing or unreadable.
    if not parser.read(config_path):
        raise FileNotFoundError(
            f"Credentials file not found or unreadable: {config_path}"
        )

    credentials = parser['Credentials']
    required_keys = (
        'user',
        'password',
        'account',
        'warehouse',
        'database',
        'schema',
        'role',
    )
    connection_params = {key: credentials[key] for key in required_keys}

    return Session.builder.configs(connection_params).create()



In [3]:
# Open the Snowpark session used by every later cell in this notebook.
session = get_session()

# Model Building
### Load the final data from the Snowflake table

In [4]:
# Name of the Snowflake table holding the prepared predictive-maintenance data.
table_name = 'predictive_maintenance_final'

In [5]:
# Reference the table through the session and drop its 'ROW' column.
sf_df = session.table(table_name).drop('ROW')

In [6]:
# Print a 3-row preview of the table.
sf_df.show(3)

-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"FAILURE"  |"METRIC5"  |"METRIC6"  |"DAY"  |"DAY_WEEK"  |"SECTOR"  |"OP_PERIOD"  |"DEV_RECONNECTED"  |"MNW1"    |"DIF_M6"  |"DIF_M5"  |"LOG_M2"           |"LOG_M3"            |"LOG_M4"            |"LOG_M7"  |"LOG_M9"           |
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|0          |6          |407438     |1      |3           |S1F0      |1            |0                  |26953834  |0         |0         |2.079441541679836  |0.0                 |3.9702919135521215  |0.0       |2.079441541679836  |
|0          |6          |403174     |1      |3           |S1F0      |1          

In [8]:
# Convert the Snowpark DataFrame to pandas for local inspection.
# NOTE(review): in the visible cells `a` is only used to list columns and show
# head(); a more descriptive name / a row limit may be preferable — confirm no
# other cell needs the full frame.
a = sf_df.to_pandas()

In [11]:
# List the column names of the pandas copy.
list(a.columns)

['FAILURE',
 'METRIC5',
 'METRIC6',
 'DAY',
 'DAY_WEEK',
 'SECTOR',
 'OP_PERIOD',
 'DEV_RECONNECTED',
 'MNW1',
 'DIF_M6',
 'DIF_M5',
 'LOG_M2',
 'LOG_M3',
 'LOG_M4',
 'LOG_M7',
 'LOG_M9']

In [10]:
# Display the first rows of the pandas copy.
a.head()

Unnamed: 0,FAILURE,METRIC5,METRIC6,DAY,DAY_WEEK,SECTOR,OP_PERIOD,DEV_RECONNECTED,MNW1,DIF_M6,DIF_M5,LOG_M2,LOG_M3,LOG_M4,LOG_M7,LOG_M9
0,0,6,407438,1,3,S1F0,1,0,26953834,0,0,2.079442,0.0,3.970292,0.0,2.079442
1,0,6,403174,1,3,S1F0,1,0,7671335,0,0,0.0,1.386294,0.0,0.0,0.0
2,0,12,237394,1,3,S1F0,1,0,21661996,0,0,0.0,0.0,0.0,0.0,0.0
3,0,6,410186,1,3,S1F0,1,0,9961753,0,0,0.0,0.0,0.0,0.0,0.0
4,0,15,313173,1,3,S1F0,1,0,16996310,0,0,0.0,0.0,0.0,0.0,1.386294


In [7]:
# Column roles for the predictive-maintenance pipeline.

# Categorical input column and the name given to its encoded output.
CATEGORICAL_COLUMNS = ["SECTOR"]
CATEGORICAL_COLUMNS_OE = ["SECTOR_OE"]

# Numeric feature columns passed to the classifier.
# NOTE(review): the table preview also shows a DEV_RECONNECTED column that is
# not listed here — confirm the omission is intentional.
NUMERICAL_COLUMNS = [
    "METRIC5",
    "METRIC6",
    "DAY",
    "DAY_WEEK",
    "OP_PERIOD",
    "MNW1",
    "DIF_M6",
    "DIF_M5",
    "LOG_M2",
    "LOG_M3",
    "LOG_M4",
    "LOG_M7",
    "LOG_M9",
]

# Target column and the column that receives the model's predictions.
LABEL_COLUMNS = ["FAILURE"]
OUTPUT_COLUMNS = ["PREDICTION"]

In [8]:
# 80/20 train/test split. A fixed seed keeps the split (and therefore any
# metric computed on test_df) reproducible across reruns.
train_df, test_df = sf_df.random_split([0.8, 0.2], seed=42)

In [9]:
# Print a 3-row preview of the training split.
train_df.show(3)

---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"FAILURE"  |"METRIC5"  |"METRIC6"  |"DAY"  |"DAY_WEEK"  |"SECTOR"  |"OP_PERIOD"  |"DEV_RECONNECTED"  |"MNW1"    |"DIF_M6"  |"DIF_M5"  |"LOG_M2"  |"LOG_M3"            |"LOG_M4"  |"LOG_M7"  |"LOG_M9"  |
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|0          |6          |403174     |1      |3           |S1F0      |1            |0                  |7671335   |0         |0         |0.0       |1.3862943611198906  |0.0       |0.0       |0.0       |
|0          |12         |237394     |1      |3           |S1F0      |1            |0                  |21661996  |0         |0         |0.0       |0.0                 |0.0       |0.0       |0.

Create a pipeline with preprocessing steps and model definition

In [10]:
# Two-step pipeline: one-hot encode the categorical column, then fit an SGD
# classifier on the numeric features.
# NOTE(review): the classifier's input_cols is NUMERICAL_COLUMNS only, so the
# encoder's output columns are not fed to the model — confirm this is intended.
# NOTE: a "MMS" MinMaxScaler step (clip=True, scaling NUMERICAL_COLUMNS in
# place) used to sit between the two steps and is currently disabled.
pipeline = Pipeline(steps=[
    (
        "OHE",
        OneHotEncoder(input_cols=CATEGORICAL_COLUMNS, output_cols=CATEGORICAL_COLUMNS_OE),
    ),
    (
        "classification",
        SGDClassifier(
            input_cols=NUMERICAL_COLUMNS,
            label_cols=LABEL_COLUMNS,
            output_cols=OUTPUT_COLUMNS,
        ),
    ),
])

In [11]:
# Fit the pipeline on the training split.
# NOTE(review): the recorded run of this cell failed with a SnowparkSQLException
# ("execute() got an unexpected keyword argument 'statement_params'"), preceded
# by warnings about local/server package version mismatches — presumably
# related; TODO confirm and align versions.
pipeline.fit(train_df)

  success, nchunks, nrows, ci_output = write_pandas(
  success, nchunks, nrows, ci_output = write_pandas(
The version of package 'snowflake-snowpark-python' in the local environment is 1.8.0, which does not fit the criteria for the requirement 'snowflake-snowpark-python'. Your UDF might not work when the package version is different between the server and your local environment.
The version of package 'numpy' in the local environment is 1.24.4, which does not fit the criteria for the requirement 'numpy==1.24.3'. Your UDF might not work when the package version is different between the server and your local environment.


SnowparkSQLException: (1300) (1304): 01af1b66-0503-cace-0072-f3030bf43e62: 100357 (P0000): Python Interpreter Error:
Traceback (most recent call last):
  File "_udf_code.py", line 90, in compute
  File "/tmp/pip_packages/snowflake/ml/modeling/_internal/snowpark_handlers.py", line 90, in fit_wrapper_sproc
  File "/usr/lib/python_udf/47551a7bfd00f4b7db3a8f702d9cb264485e91e604075ad17c0dfcb8e138b05a/lib/python3.8/site-packages/snowflake/snowpark/_internal/telemetry.py", line 76, in wrap
    result = func(*args, **kwargs)
  File "/usr/lib/python_udf/47551a7bfd00f4b7db3a8f702d9cb264485e91e604075ad17c0dfcb8e138b05a/lib/python3.8/site-packages/snowflake/snowpark/dataframe.py", line 492, in to_pandas
    result = self._session._conn.execute(self._plan, to_pandas=True, **kwargs)
  File "/usr/lib/python_udf/47551a7bfd00f4b7db3a8f702d9cb264485e91e604075ad17c0dfcb8e138b05a/lib/python3.8/site-packages/snowflake/snowpark/_internal/server_connection.py", line 354, in execute
    result_set, result_meta = self.get_result_set(
  File "/usr/lib/python_udf/47551a7bfd00f4b7db3a8f702d9cb264485e91e604075ad17c0dfcb8e138b05a/lib/python3.8/site-packages/snowflake/snowpark/_internal/analyzer/snowflake_plan.py", line 84, in wrap
    return func(*args, **kwargs)
  File "/usr/lib/python_udf/47551a7bfd00f4b7db3a8f702d9cb264485e91e604075ad17c0dfcb8e138b05a/lib/python3.8/site-packages/snowflake/snowpark/_internal/server_connection.py", line 390, in get_result_set
    result = self.run_query(
  File "/usr/lib/python_udf/47551a7bfd00f4b7db3a8f702d9cb264485e91e604075ad17c0dfcb8e138b05a/lib/python3.8/site-packages/snowflake/snowpark/_internal/server_connection.py", line 102, in wrap
    raise ex
  File "/usr/lib/python_udf/47551a7bfd00f4b7db3a8f702d9cb264485e91e604075ad17c0dfcb8e138b05a/lib/python3.8/site-packages/snowflake/snowpark/_internal/server_connection.py", line 95, in wrap
    return func(*args, **kwargs)
  File "/usr/lib/python_udf/47551a7bfd00f4b7db3a8f702d9cb264485e91e604075ad17c0dfcb8e138b05a/lib/python3.8/site-packages/snowflake/snowpark/_internal/server_connection.py", line 311, in run_query
    raise ex
  File "/usr/lib/python_udf/47551a7bfd00f4b7db3a8f702d9cb264485e91e604075ad17c0dfcb8e138b05a/lib/python3.8/site-packages/snowflake/snowpark/_internal/server_connection.py", line 304, in run_query
    results_cursor = self._cursor.execute(query, **kwargs)
TypeError: execute() got an unexpected keyword argument 'statement_params'
 in function SNOWPARK_TEMP_PROCEDURE_KWZI68PTG3 with handler compute

In [None]:
# Run the pipeline on the held-out split. This cell has no recorded execution
# (In [None]), so no output is available for it.
result = pipeline.predict(test_df)

In [7]:
# Second dataset: reference the bank-churn table through the same session and
# drop its 'ROW' column.
sf_df1 = session.table("BANK_CUSTOMER_CHURN_PREDICTION").drop('ROW')

In [8]:
# Column roles for the bank-churn pipeline.
# NOTE: these rebind the same constant names used by the predictive-maintenance
# cells earlier in the notebook, so cell execution order matters.

# Categorical input columns and the names given to their encoded outputs.
CATEGORICAL_COLUMNS = [
    "COUNTRY",
    "GENDER",
]
CATEGORICAL_COLUMNS_OE = [
    "COUNTRY_OE",
    "GENDER_OE",
]

# Numeric feature columns.
NUMERICAL_COLUMNS = [
    "CREDIT_SCORE",
    "AGE",
    "TENURE",
    "BALANCE",
    "PRODUCTS_NUMBER",
    "ESTIMATED_SALARY",
]

# Target column and the column that receives the model's predictions.
LABEL_COLUMNS = ["CHURN"]
OUTPUT_COLUMNS = ["PREDICTION"]

In [9]:
# Three-step churn pipeline: ordinal-encode the categorical columns, min-max
# scale the numeric columns in place (clipped), then fit an XGBoost classifier
# on the encoded + scaled columns plus CREDIT_CARD.
pipeline1 = Pipeline(steps=[
    (
        "OE",
        OrdinalEncoder(input_cols=CATEGORICAL_COLUMNS, output_cols=CATEGORICAL_COLUMNS_OE),
    ),
    (
        "MMS",
        MinMaxScaler(clip=True, input_cols=NUMERICAL_COLUMNS, output_cols=NUMERICAL_COLUMNS),
    ),
    (
        "classification",
        XGBClassifier(
            input_cols=CATEGORICAL_COLUMNS_OE + NUMERICAL_COLUMNS + ["CREDIT_CARD"],
            label_cols=LABEL_COLUMNS,
            output_cols=OUTPUT_COLUMNS,
        ),
    ),
])

In [10]:
# 80/20 train/test split of the churn data. A fixed seed keeps the split
# reproducible across reruns.
train_df1, test_df1 = sf_df1.random_split([0.8, 0.2], seed=42)

In [11]:
# Fit the churn pipeline on its training split.
# NOTE(review): the recorded run of this cell failed with a SnowparkSQLException
# ("execute() got an unexpected keyword argument 'statement_params'"), preceded
# by warnings about local/server package version mismatches — presumably
# related; TODO confirm and align versions.
pipeline1.fit(train_df1)

The version of package 'snowflake-snowpark-python' in the local environment is 1.8.0, which does not fit the criteria for the requirement 'snowflake-snowpark-python'. Your UDF might not work when the package version is different between the server and your local environment.
The version of package 'xgboost' in the local environment is 1.7.6, which does not fit the criteria for the requirement 'xgboost==1.7.3'. Your UDF might not work when the package version is different between the server and your local environment.
The version of package 'numpy' in the local environment is 1.24.4, which does not fit the criteria for the requirement 'numpy==1.24.3'. Your UDF might not work when the package version is different between the server and your local environment.


SnowparkSQLException: (1300) (1304): 01af1b6d-0503-cace-0072-f3030bf452aa: 100357 (P0000): Python Interpreter Error:
Traceback (most recent call last):
  File "_udf_code.py", line 90, in compute
  File "/tmp/pip_packages/snowflake/ml/modeling/_internal/snowpark_handlers.py", line 90, in fit_wrapper_sproc
  File "/usr/lib/python_udf/b225293ec8767520ec91e8f679e01dbef2d80eb92d0b90f1187699a09af30f8f/lib/python3.8/site-packages/snowflake/snowpark/_internal/telemetry.py", line 76, in wrap
    result = func(*args, **kwargs)
  File "/usr/lib/python_udf/b225293ec8767520ec91e8f679e01dbef2d80eb92d0b90f1187699a09af30f8f/lib/python3.8/site-packages/snowflake/snowpark/dataframe.py", line 492, in to_pandas
    result = self._session._conn.execute(self._plan, to_pandas=True, **kwargs)
  File "/usr/lib/python_udf/b225293ec8767520ec91e8f679e01dbef2d80eb92d0b90f1187699a09af30f8f/lib/python3.8/site-packages/snowflake/snowpark/_internal/server_connection.py", line 354, in execute
    result_set, result_meta = self.get_result_set(
  File "/usr/lib/python_udf/b225293ec8767520ec91e8f679e01dbef2d80eb92d0b90f1187699a09af30f8f/lib/python3.8/site-packages/snowflake/snowpark/_internal/analyzer/snowflake_plan.py", line 84, in wrap
    return func(*args, **kwargs)
  File "/usr/lib/python_udf/b225293ec8767520ec91e8f679e01dbef2d80eb92d0b90f1187699a09af30f8f/lib/python3.8/site-packages/snowflake/snowpark/_internal/server_connection.py", line 390, in get_result_set
    result = self.run_query(
  File "/usr/lib/python_udf/b225293ec8767520ec91e8f679e01dbef2d80eb92d0b90f1187699a09af30f8f/lib/python3.8/site-packages/snowflake/snowpark/_internal/server_connection.py", line 102, in wrap
    raise ex
  File "/usr/lib/python_udf/b225293ec8767520ec91e8f679e01dbef2d80eb92d0b90f1187699a09af30f8f/lib/python3.8/site-packages/snowflake/snowpark/_internal/server_connection.py", line 95, in wrap
    return func(*args, **kwargs)
  File "/usr/lib/python_udf/b225293ec8767520ec91e8f679e01dbef2d80eb92d0b90f1187699a09af30f8f/lib/python3.8/site-packages/snowflake/snowpark/_internal/server_connection.py", line 311, in run_query
    raise ex
  File "/usr/lib/python_udf/b225293ec8767520ec91e8f679e01dbef2d80eb92d0b90f1187699a09af30f8f/lib/python3.8/site-packages/snowflake/snowpark/_internal/server_connection.py", line 304, in run_query
    results_cursor = self._cursor.execute(query, **kwargs)
TypeError: execute() got an unexpected keyword argument 'statement_params'
 in function SNOWPARK_TEMP_PROCEDURE_5TEQENU1RA with handler compute