<header style="background:#00233C;padding-left:20pt;padding-right:20pt;padding-top:20pt;padding-bottom:10pt;"><img id="Teradata-logo" src="https://storage.googleapis.com/clearscape_analytics_data/Logo/teradata.svg" alt="Teradata" style="width: 100px; height: auto; margin-top: 20pt;" align="right">
<p style="font-size:20px; color:#ffffff">UDW INNOVATION DAYS</p>
<p style="font-size:24px; color:#ffffff">Logistic Regression</p>
</header>

### Use Case - Logistic Regression for Breast Cancer
1. Connect to Vantage
2. Split the data into Training/Scoring dataset
3. Create Logistic Regression model
4. Create the prediction with the trained model
5. Evaluate the forecast accuracy with the scoring dataset

#### Import teradataml package libraries

##### Install packages as needed
Note: You only need to run this once per environment. The leading "!" runs the rest of the line as a shell command from the notebook cell.

In [None]:
# Install the teradataml client package into the per-user site-packages.
# NOTE(review): the version is not pinned, so the installed release may differ between runs.
!pip install teradataml --user

In [1]:
# managing connection context
from teradataml import create_context, get_context, remove_context

# for setting configure options
from teradataml import configure

# for teradataml DataFrame object
from teradataml.dataframe.dataframe import DataFrame, in_schema

# for copying pandas dataframe to SQL table
from teradataml.dataframe.copy_to import copy_to_sql

# dataframe manipulation methods and sql data types
from teradatasqlalchemy.types import *
from sqlalchemy.sql.expression import select, and_, or_, not_, extract, text, join, case as case_when
from sqlalchemy import func, sql, distinct

# teradataml utils
from teradataml import configure, db_drop_table, UtilFuncs

# Vantage Analytics Library (valib)
from teradataml.analytics.valib import *
from teradataml.analytics import Transformations as tf 

#### Import other helpful open source packages

In [2]:
# Open source packages

# hide passwords
import getpass as gp

# visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

# system
from os.path import exists
import yaml, sys
from datetime import datetime as dt, timedelta
import math

# dataframes and matrices
import pandas as pd
import numpy as np

%matplotlib inline

##### Configure Display Options

In [3]:
# Start from matplotlib's defaults so earlier sessions cannot leak styling,
# then apply the seaborn grid theme and the compact figure settings used in this notebook.
plt.rcdefaults()
sns.set_style("whitegrid")
plt.rcParams.update({
    'figure.figsize': (5, 3),
    'font.size': 8,
})

### Connection Variables

##### Set User and Password Variables

In [4]:
# Prompt for the Vantage user name without echoing it; it is also used later as a table-name prefix.
user = gp.getpass(prompt="User")

User ········


In [5]:
# Prompt for the password without echoing it, so it never appears in the notebook output.
password = gp.getpass(prompt="Password")

Password ········


##### Set Connection Variables

In [9]:
# Where and how to log on.
host, logmech = 'UDWTest', 'LDAP'

# Default database for the session, and the database assigned to
# configure.val_install_location further down.
defaultDB, val_database = 'INOUDWTRAINING2024', 'val'

##### Create Context
See the PythonBasics-1-ConnectingToVantage Notebook for more information about contexts and garbage collection.  

In [12]:
# Open the Vantage connection used by every teradataml call below.
# The logon mechanism comes from the `logmech` variable set in the connection-variables cell
# (it was previously hard-coded here, so changing that variable had no effect).
td_context = create_context(
    host=host,
    username=user,
    password=password,
    logmech=logmech,
    database=defaultDB,
)



#### Set Vantage Analytics Library (VAL) database location
`from teradataml import configure`

In [13]:
# Point teradataml at the database that holds the Vantage Analytics Library (VAL) functions;
# the valib.* calls below depend on this setting.
configure.val_install_location = val_database

## Logistic Regression

#### Create virtual DataFrame

Create a teradataml DataFrame object for the database table "breastcancer"

You only need to use `in_schema` if you are accessing a table in a non-default database.

Example: `df = DataFrame(in_schema("some_other_db", "some_table"))`

In [14]:
# Virtual DataFrame over the "breastcancer" table in the default database.
src_df = DataFrame(table_name='breastcancer')

# Preview five sampled rows (sample() appends a `sampleid` column to its output).
src_df.sample(n=5)

id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave_points_mean,symmetry_mean,fractal_dimension_mean,radius_se,texture_se,perimeter_se,area_se,smoothness_se,compactness_se,concavity_se,concave_points_se,symmetry_se,fractal_dimension_se,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave_points_worst,symmetry_worst,fractal_dimension_worst,sampleid
88350402,B,13.64,15.6,87.38,575.3,0.09,0.07,0.05,0.04,0.17,0.06,0.32,0.66,2.0,27.19,0.01,0.01,0.02,0.01,0.02,0.0,14.85,19.05,94.11,683.4,0.13,0.13,0.15,0.09,0.25,0.07,1
842517,M,20.57,17.77,132.9,1326.0,0.08,0.08,0.09,0.07,0.18,0.06,0.54,0.73,3.4,74.08,0.01,0.01,0.02,0.01,0.01,0.0,24.99,23.41,158.8,1956.0,0.12,0.19,0.24,0.19,0.28,0.09,1
898678,B,12.06,18.9,76.66,445.3,0.08,0.06,0.01,0.01,0.16,0.06,0.24,1.15,1.56,18.02,0.01,0.01,0.01,0.01,0.02,0.0,13.64,27.06,86.54,562.6,0.13,0.14,0.05,0.05,0.29,0.08,1
89143602,B,14.41,19.73,96.03,651.0,0.09,0.17,0.14,0.07,0.17,0.07,0.88,1.77,4.36,77.11,0.01,0.11,0.1,0.03,0.04,0.02,15.77,22.13,101.7,767.3,0.1,0.25,0.22,0.1,0.23,0.09,1
917897,B,9.85,15.68,63.0,293.2,0.09,0.08,0.02,0.02,0.14,0.07,0.25,1.22,1.98,15.24,0.01,0.02,0.01,0.01,0.02,0.0,11.24,22.99,74.32,376.5,0.14,0.22,0.08,0.07,0.25,0.09,1


#### Showing the available columns and datatype

In [15]:
# List each column name together with the Python type teradataml reports for it.
src_df.dtypes

COLUMN NAME,TYPE
id,int
diagnosis,str
radius_mean,float
texture_mean,float
perimeter_mean,float
area_mean,float
smoothness_mean,float
compactness_mean,float
concavity_mean,float
concave_points_mean,float


#### Define the dependent variable and independent variables

In [16]:
# Column 0 is the row id, column 1 is the label, and every remaining column is a feature.
all_columns = src_df.columns
y, X = all_columns[1], all_columns[2:]
X

['radius_mean',
 'texture_mean',
 'perimeter_mean',
 'area_mean',
 'smoothness_mean',
 'compactness_mean',
 'concavity_mean',
 'concave_points_mean',
 'symmetry_mean',
 'fractal_dimension_mean',
 'radius_se',
 'texture_se',
 'perimeter_se',
 'area_se',
 'smoothness_se',
 'compactness_se',
 'concavity_se',
 'concave_points_se',
 'symmetry_se',
 'fractal_dimension_se',
 'radius_worst',
 'texture_worst',
 'perimeter_worst',
 'area_worst',
 'smoothness_worst',
 'compactness_worst',
 'concavity_worst',
 'concave_points_worst',
 'symmetry_worst',
 'fractal_dimension_worst']

#### Label encoding of the dependent variable
- M --> 1
- B --> 0

In [17]:
# Recode the diagnosis labels (B -> 0, M -> 1), writing the result back under the same column name.
rc_1 = tf.LabelEncoder(
    values={"B": 0, "M": 1},
    default="SAME",
    columns=y,
    out_columns=y,
)

# Carry the feature columns through the transformation unchanged.
rt = Retain(columns=X)

# Run both transformations in one VAL Transform call, keyed on the row id.
src_obj = valib.Transform(
    data=src_df,
    key_columns=["id"],
    label_encode=[rc_1],
    retain=rt,
)
src_trf_df = src_obj.result

#### Split the data into Training and Scoring dataset

In [18]:
train_ratio = 0.75
test_ratio = 1.0 - train_ratio

# sample() tags each returned row with a `sampleid`; the rows tagged 1 are used for training
# and the rows tagged 2 for scoring.
# NOTE(review): no seed is passed, so the split (and the counts shown below) can change between runs.
sample_df = src_trf_df.sample(frac=[train_ratio, test_ratio], randomize=True)

is_train = sample_df.sampleid == 1
is_test = sample_df.sampleid == 2

train_df = sample_df[is_train]
test_df = sample_df[is_test]

#### Showing the data distribution for Training data

In [19]:
# Number of training rows per diagnosis class (1 = malignant, 0 = benign after label encoding).
train_df.groupby(y).assign(cnt = func.count(0))

diagnosis,cnt
1,158
0,269


#### Showing the data distribution for Scoring data

In [20]:
# Number of scoring rows per diagnosis class (1 = malignant, 0 = benign after label encoding).
test_df.groupby(y).assign(cnt = func.count(0))

diagnosis,cnt
1,60
0,82


#### Save the training and scoring dataset as new tables

In [21]:
# Persist both splits as user-prefixed tables so that training and scoring read fixed data
# instead of re-evaluating the sampling query.
split_tables = (
    (train_df, f"{user}_breastcancer_reg_train"),
    (test_df, f"{user}_breastcancer_reg_test"),
)

for split_df, table_name in split_tables:
    copy_to_sql(
        split_df.select(['id', y] + X),   # keep the id, the label and the features only
        table_name,
        schema_name=defaultDB,
        if_exists='replace',
        primary_index="id",
        set_table=False,
    )


#### Train a Logistic Regression model with the training dataset
    DESCRIPTION:
        Logistic Regression is one of the most widely used types of statistical analysis.
        In logistic regression, a set of independent variables (columns) is used to predict
        the value of a dependent variable (column) that takes two values, referred to as
        response (1) and non-response (0). The result is not a continuous numeric value as
        in linear regression, but a probability between 0 and 1 that the dependent variable
        takes the response value. The model consists of the coefficients of the independent
        variables together with a constant term; applying these coefficients to the variables
        (columns) of each observation (row) in a data set is known as scoring, as described
        in Logistic Regression Scoring.

In [22]:
# Train from the persisted training table rather than the in-memory sample.
train_df = DataFrame(f"{user}_breastcancer_reg_train")

# Fit on all feature columns, with a constant term and no stepwise selection;
# diagnosis == 1 (malignant) is the response value being modelled.
logreg_args = dict(
    data=train_df,
    columns=X,
    response_column=y,
    response_value=1,
    constant=True,
    stepwise=False,
)
model_obj = valib.LogReg(**logreg_args)

#### Display the model and the statistics

In [23]:
# Show up to 100 rows of the fitted model table: one row per model term with its
# coefficient, standard error, test statistics and odds ratio.
model_obj.model.head(100)

Column Name,B Coefficient,Standard Error,Wald Statistic,T Statistic,P-Value,Odds Ratio,Lower,Upper,Partial R,Standardized Coefficient
area_se,-0.597130295661717,533.493631487136,1.25279419629843e-06,-0.0011192828937754,0.999107505252085,0.550388825177831,0.0,,0.0,-13.9913592124895
compactness_mean,755.859114720794,61538.8953372851,0.0001508628530908,0.0122826240311603,0.990206315774345,,0.0,,0.0,21.9424757776659
compactness_se,-2559.66494064889,72057.7703375301,0.0012618409013764,-0.0355223999946017,0.971681083802349,0.0,0.0,,0.0,-25.5163385042966
compactness_worst,-162.571646635766,23360.0369247491,4.84331325658861e-05,-0.0069593916807351,0.994450758171437,2.48903470542895e-71,0.0,,0.0,-13.5957765257569
concave_points_se,5846.87346462789,288586.08391495,0.0004104843656046,0.0202604137569954,0.983845838880249,,0.0,,0.0,20.9253091363379
concave_points_worst,-335.78172839684,70498.991523204,2.26854976855527e-05,-0.0047629295276702,0.996202144861386,1.48541646432142e-146,0.0,,0.0,-11.9753834733322
concavity_mean,1117.12903864359,78279.9326651082,0.0002036600261449,0.0142709504289282,0.988621002417784,,0.0,,0.0,48.3539155918281
concavity_se,-1732.37908873006,76990.8768349593,0.0005062993083082,-0.0225010957135046,0.982059571385843,0.0,0.0,,0.0,-28.1349260535396
concavity_worst,163.606971522641,33705.2117988675,2.35618556069292e-05,0.0048540555834198,0.996129483518648,1.13137071537571e+71,0.0,,0.0,17.866052291941
fractal_dimension_mean,-1700.34972759272,202333.619400096,7.06220648912267e-05,-0.0084036935267313,0.99329913347921,0.0,0.0,,0.0,-6.98352868630673


In [24]:
# Overall fit summary: observation and iteration counts, log likelihoods,
# the likelihood-ratio and chi-square figures, and McFadden's pseudo R-squared.
model_obj.statistical_measures

rid,Total Observations,Total Iterations,Initial Log Likelihood,Final Log Likelihood,Likelihood Ratio Test G Statistic,Chi-Square Degrees of Freedom,Chi-Square Value,Chi-Square Probability,McFaddens Pseudo R-Squared,Dependent Variable,Dependent Response Value,Total Distinct Values
1,427.0,19.0,-282.423571418344,-0.000388332539852,564.846366171608,30.0,43.7729718256873,0.0,0.999998624999543,diagnosis,1.0,2.0


#### Create the prediction from the trained Logistic Regression model using the scoring dataset
    DESCRIPTION:
        Logistic Regression Scoring is the application of a Logistic Regression model to
        input data that contains the same independent variable columns contained in the
        model. The result is output score data that minimally contains one or more key
        columns and the estimated probability that the dependent variable takes the
        response value (written here to the "Probability" column).

In [25]:
# Score the persisted scoring table with the model fitted above.
test_df = DataFrame(f"{user}_breastcancer_reg_test")

pred_obj = valib.LogRegPredict(
    data=test_df,
    model=model_obj.model,
    accumulate=y,                # carry the actual label into the output for evaluation
    prob_column="Probability",   # name of the predicted-probability column
)

In [26]:
# Scored rows: id, the actual diagnosis carried through, and the predicted probability.
pred_obj.result

id,diagnosis,Probability
91903902,0,0.0
89511501,0,0.0
87127,0,0.0
912558,0,0.0
923780,0,0.0
866714,0,0.0
884437,0,3.900161178062741e-12
857438,1,1.0
858477,0,0.0
903507,1,1.0


#### Calculate the confusion matrix

In [None]:
# Decision threshold: a row is predicted positive (1) when Probability >= alpha, otherwise negative (0).
alpha = 0.5

scored_df = pred_obj.result
probability = scored_df.Probability.expression

# Both flags are derived from the SAME threshold so that every row gets exactly one prediction.
# (The positive flag previously used `>= 1.0 - alpha`, which only matches `< alpha` when
# alpha == 0.5; any other threshold left rows flagged as neither or as both.)
# NOTE(review): the list form of case() is kept from the original; newer SQLAlchemy releases
# expect the (condition, value) tuples as positional arguments - confirm against the installed version.
result_df = scored_df.assign(
    drop_columns=True,   # keep only the three columns defined here
    Actual=scored_df[y],
    Pred_0=case_when([(probability < alpha, 1)], else_=0),
    Pred_1=case_when([(probability >= alpha, 1)], else_=0),
)

In [None]:

# Summing the 0/1 prediction flags per actual class gives the confusion matrix:
# one row per actual label, with sum_Pred_0 / sum_Pred_1 holding the predicted counts.
grouped_sums = result_df.groupby("Actual").sum()

# Pull the small aggregate to the client and order it by label.
# (The variable name is kept as-is because the metrics cell below reads it.)
confusion_maxtrix = grouped_sums.to_pandas().sort_values("Actual")
confusion_maxtrix

#### Calculate Accuracy, Precision, Recall and F1

In [None]:
# Compare labels numerically so the lookups work whether "Actual" came back from the
# database as the strings '0'/'1' or as numbers.
_actual_labels = pd.to_numeric(confusion_maxtrix["Actual"])


def _cm_count(actual_label, pred_column):
    """Return one confusion-matrix cell, or 0 when that actual class has no rows."""
    rows = confusion_maxtrix.loc[_actual_labels == actual_label, pred_column]
    return int(rows.iloc[0]) if len(rows) else 0


def _safe_ratio(numerator, denominator):
    """Divide, returning NaN instead of failing when the denominator is zero."""
    return numerator / denominator if denominator else float("nan")


true_neg = _cm_count(0, "sum_Pred_0")
true_pos = _cm_count(1, "sum_Pred_1")
false_pos = _cm_count(0, "sum_Pred_1")
false_neg = _cm_count(1, "sum_Pred_0")

# Total from the four named cells rather than from a positional column slice,
# so extra or reordered columns cannot distort it.
total_rec = true_neg + true_pos + false_pos + false_neg

Accuracy = _safe_ratio(true_neg + true_pos, total_rec)
Precision = _safe_ratio(true_pos, true_pos + false_pos)
Recall = _safe_ratio(true_pos, true_pos + false_neg)
F1 = _safe_ratio(2.0 * (Precision * Recall), Precision + Recall)

print("*** Forecast Accuracy ***")
print(f"Accuracy\t{np.round(Accuracy * 100.0,5)}")
print(f"Precision\t{np.round(Precision * 100.0,5)}")
print(f"Recall\t\t{np.round(Recall * 100.0,5)}")
print(f"F1 Score\t{np.round(F1 * 100.0,5)}")


#### Cleanup Tables

In [None]:
# Best-effort cleanup of the tables created by this notebook. A failure on one table
# (for example, it was never created) must not stop the other from being dropped, but it
# is reported instead of being silently swallowed.
for table_name in (f'{user}_breastcancer_reg_train', f'{user}_breastcancer_reg_test'):
    try:
        db_drop_table(table_name)
    except Exception as err:
        print(f"Could not drop table {table_name}: {err}")

##### Disconnect from Vantage

In [None]:
# Always finish with remove_context(): it closes the Vantage connection and
# garbage-collects the objects teradataml generated internally during the session.
remove_context()

<span style="font-size:16px;">For online documentation on Teradata Vantage analytic functions, refer to the [Teradata Developer Portal](https://docs.teradata.com/) and search for phrases "Python User Guide" and "Python Function Reference".</span>