In [2]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=20)
mpl.rc('xtick', labelsize=16)
mpl.rc('ytick', labelsize=16)

# Ignore useless warnings (see SciPy issue #5998)
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")

In [3]:
import tarfile
import urllib
import pandas as pd
import urllib.request

In [4]:
data = pd.read_csv("https://raw.githubusercontent.com/BenCoke12/DMMLG3/main/Breast_Cancer.csv")

In [5]:
from sklearn.preprocessing import OrdinalEncoder
enc = OrdinalEncoder()
enc.fit(data[["Race", "Marital Status", "T Stage ", "N Stage", "6th Stage", "differentiate", "Grade", "A Stage", "Estrogen Status", "Progesterone Status", "Status"]])
data[["Race", "Marital Status", "T Stage ", "N Stage", "6th Stage", "differentiate", "Grade", "A Stage", "Estrogen Status", "Progesterone Status", "Status"]] = enc.transform(data[["Race", "Marital Status", "T Stage ", "N Stage", "6th Stage", "differentiate", "Grade", "A Stage", "Estrogen Status", "Progesterone Status", "Status"]])

In [6]:
X=data.iloc[:, -1].values
#X=data

In [7]:
X.shape

(4024,)

In [8]:
Y=data.iloc[:,5]
#Y=data['6th Stage']

In [9]:
X = data.drop('6th Stage', axis = 1)

In [10]:
Y

0       0.0
1       2.0
2       4.0
3       0.0
4       1.0
       ... 
4019    0.0
4020    2.0
4021    1.0
4022    1.0
4023    1.0
Name: 6th Stage, Length: 4024, dtype: float64

# **Linear Regression**

In [11]:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(X, Y) #train linear regression on X and y
lin_reg.intercept_, lin_reg.coef_ # learning paramete

(0.1532715738597632,
 array([ 7.47709853e-04, -7.31235387e-03, -1.29318983e-02,  7.19816860e-01,
         1.25938348e+00,  3.68077677e-03,  2.66567965e-02,  1.03511300e-02,
        -3.29849068e-03,  7.87177678e-03, -2.35394018e-02,  6.88622673e-04,
         2.52844954e-02, -3.39924610e-04, -3.72880257e-02]))

In [12]:
from sklearn.model_selection import train_test_split

In [13]:
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.3,random_state=42)

In [14]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [15]:
X_train.shape

(2816, 15)

In [16]:
from sklearn.linear_model import LinearRegression

In [17]:
lr=LinearRegression()

In [18]:
lr.fit(X_train,Y_train)

LinearRegression()

In [19]:
y_pred=lr.predict(X_test)

In [20]:
y_pred

array([0.18430119, 0.17949237, 2.27254092, ..., 1.48357644, 0.17201951,
       0.77620842])

# **Mean Squared Error**

In [21]:
from sklearn.metrics import mean_squared_error
tree_mse = mean_squared_error(Y_test, y_pred)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

0.3505847512757797

# **Mean Absolute Error**

In [22]:
from sklearn.metrics import mean_absolute_error

tree_mae = mean_absolute_error(Y_test, y_pred)
tree_mae

0.2735340117442727

# **Logistic Regression**

In [23]:
from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression()
log_reg.fit(X_train, Y_train) #train linear regression on X and y


LogisticRegression()

In [24]:
log_reg.intercept_, lin_reg.coef_ # learning paramete

(array([-0.93752507,  4.45176145,  5.34154713, -5.0712008 , -3.78458271]),
 array([ 7.47709853e-04, -7.31235387e-03, -1.29318983e-02,  7.19816860e-01,
         1.25938348e+00,  3.68077677e-03,  2.66567965e-02,  1.03511300e-02,
        -3.29849068e-03,  7.87177678e-03, -2.35394018e-02,  6.88622673e-04,
         2.52844954e-02, -3.39924610e-04, -3.72880257e-02]))

In [25]:
log_reg.coef_ 

array([[-1.01206120e-02, -2.72365647e-02, -1.79801647e-02,
        -7.53371753e+00, -7.36089186e+00,  1.01635513e-03,
        -4.31378712e-02,  2.27854368e-03, -2.16511653e+00,
         2.82246150e-02,  5.63987272e-02, -1.02685312e-01,
        -2.20971525e+00,  3.42318859e-02, -1.40346374e-01],
       [ 2.03521537e-02, -3.48824071e-02, -5.13615604e-03,
        -1.03642496e+00, -5.24955833e+00, -1.68694080e-02,
         3.42657904e-02,  1.04024549e-01, -8.63491843e-01,
         5.36148006e-03, -9.24414081e-03, -5.61108098e-02,
        -4.72045897e-01, -1.84688865e-03, -1.74665411e-02],
       [ 6.21107596e-02,  8.47094028e-02,  1.23446343e-02,
         1.26859106e+00,  2.94257985e+00,  1.41542772e-02,
        -2.49274999e-02,  4.37917456e-01,  1.21212984e+00,
         9.55213146e-03,  1.32783069e-03, -2.03871550e-02,
         1.50550639e-01, -7.04918333e-02, -7.09393329e-02],
       [-2.11972585e-02, -7.54115781e-02,  1.22779491e-02,
         5.84488479e+00,  2.02542838e+00,  8.84214782

In [26]:
log_reg=LogisticRegression()

In [27]:
log_reg.fit(X_train,Y_train)

LogisticRegression()

In [28]:
y_pred=log_reg.predict(X_test)

In [29]:
y_pred

array([0., 0., 2., ..., 2., 0., 1.])

# **Mean Squared Error**

In [30]:
from sklearn.metrics import mean_squared_error
tree_mse = mean_squared_error(Y_test, y_pred)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

0.0575435337648436

# **Mean Absolute Error**

In [31]:
from sklearn.metrics import mean_absolute_error

tree_mae = mean_absolute_error(Y_test, y_pred)
tree_mae

0.0033112582781456954

In [32]:
theta_best_svd, residuals, rank, s = np.linalg.lstsq(X_train, Y_train, rcond=1e-6)
theta_best_svd # Least-squares solution

array([ 1.07637535e-02, -1.07470567e-03, -9.90666729e-03,  5.61895153e-01,
        8.73027785e-01,  4.76838987e-04,  1.54222429e-02,  6.58020801e-03,
       -7.19869988e-02, -7.65214424e-04, -1.46444691e-02,  8.06809836e-04,
        1.48113420e-01, -6.26661657e-03, -1.81448255e-02])

# **Linear Regression with learning parameters and batch size**: 

In [33]:
def get_full_sample_matrix(samples):
    samples_matrix = samples.copy()
    if samples.ndim == 1:
        samples_matrix = samples_matrix.reshape(-1, 1)
        
    ones_vec = np.ones((samples_matrix.shape[0], 1), dtype=samples.dtype)
    return np.hstack([ones_vec, samples_matrix])

In [34]:
def grad_desc(X, Y, rate = 0.001, iterations = 100):#learning rate and batch size
    w = np.zeros((X.shape[1], 1))
    for _ in range(iterations):
        errors = Y - X.dot(w)
        grad = -(X.T).dot(errors)
        w = w - rate*grad
    return w

In [35]:
w=grad_desc(X,Y)

In [36]:
w

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4014,4015,4016,4017,4018,4019,4020,4021,4022,4023
Age,0.0,,,,,,,,,,...,,,,,,,,,,
Race,0.0,,,,,,,,,,...,,,,,,,,,,
Marital Status,0.0,,,,,,,,,,...,,,,,,,,,,
T Stage,0.0,,,,,,,,,,...,,,,,,,,,,
N Stage,0.0,,,,,,,,,,...,,,,,,,,,,
differentiate,0.0,,,,,,,,,,...,,,,,,,,,,
Grade,0.0,,,,,,,,,,...,,,,,,,,,,
A Stage,0.0,,,,,,,,,,...,,,,,,,,,,
Tumor Size,0.0,,,,,,,,,,...,,,,,,,,,,
Estrogen Status,0.0,,,,,,,,,,...,,,,,,,,,,


# **SGD Regressor**

In [37]:
from sklearn.linear_model import SGDRegressor

In [38]:
sgdr = SGDRegressor(max_iter=1000).fit(X_train, Y_train)

In [39]:
sgdr.score(X_test,Y_test)

0.920729599938158

In [40]:
mean_squared_error(Y_train, sgdr.predict(X_train))

0.12068974376949786

In [41]:
mean_squared_error(Y_test, sgdr.predict(X_test))

0.12294260713946333