In [29]:
import numpy as np
import pandas as pd
import os
from pathlib import Path
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, cross_validate, ShuffleSplit
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.utils import shuffle
from sklearn.dummy import DummyRegressor
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix,roc_curve, roc_auc_score,log_loss, classification_report,r2_score
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt

In [None]:
os.environ['KAGGLE_USERNAME'] = 'vardhan.chebrolu'
os.environ['KAGGLE_KEY'] = '7ef605fc8dba5425d6965fbd4c8fbe1f'
! kaggle datasets download -d uciml/breast-cancer-wisconsin-data
! unzip breast-cancer-wisconsin-data.zip

Dataset URL: https://www.kaggle.com/datasets/uciml/breast-cancer-wisconsin-data
License(s): CC-BY-NC-SA-4.0
breast-cancer-wisconsin-data.zip: Skipping, found more recently modified local copy (use --force to force download)
Archive:  breast-cancer-wisconsin-data.zip
replace data.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [None]:
# Load the extracted CSV and preview it.
# The unzip step writes data.csv into the current working directory, so read it
# from there rather than from a hardcoded absolute '/content' path that only
# matches when the notebook happens to run from that directory.
df = pd.read_csv('data.csv')
df.head()

In [None]:
# Count missing values per column; the raw boolean frame from df.isna() is a
# rows-by-columns wall of True/False that does not answer "where is data missing?".
df.isna().sum()

In [None]:
# Class balance of the target column.
df.loc[:, 'diagnosis'].value_counts()

In [None]:
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer

# Feature matrix: every column except the target, the row id and 'Unnamed: 32'.
X = df.drop(columns=['diagnosis', 'id', 'Unnamed: 32'])
y = df['diagnosis']

# Integer-encode the diagnosis labels.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# All remaining columns are treated as numerical and standardised.
numerical_features = list(X.columns)

preprocessor = ColumnTransformer(
    transformers=[('num', StandardScaler(), numerical_features)],
    remainder='passthrough',  # columns not listed above would be forwarded unchanged
)

features_scaled = preprocessor.fit_transform(X)

# Sanity check: scaling must not change the shape.
print("Shape of original features:", X.shape)
print("Shape after preprocessing:", features_scaled.shape)

In [None]:
# Seeded NumPy generator (not referenced by the cells visible in this notebook section).
ng = np.random.default_rng(7)

# Reload the raw CSV into a separate frame for the modelling cells.
# The unzip step writes data.csv into the current working directory, so read it
# from there rather than from a hardcoded absolute '/content' path.
dataset = pd.read_csv('data.csv')
dataset.head()

In [None]:
# Columns used as predictors throughout the modelling cells.
feature_columns = ['radius_mean', 'area_mean', 'area_worst']

# Unshuffled predictors / target taken straight from the raw frame.
features = dataset[feature_columns]
labels = dataset['diagnosis']

# Shuffled copies of the same columns (fixed seed for reproducibility).
x, y = shuffle(dataset[feature_columns], dataset['diagnosis'], random_state=42)


In [None]:
# Shuffle predictors and target together so their rows stay paired.
features, labels = shuffle(features, labels, random_state=42)

# Hold out 20% of the rows for testing.
(train_features, test_features,
 train_labels, test_labels) = train_test_split(
    features, labels, test_size=0.2, random_state=42
)

# Ten random 80/20 resamplings used for cross-validation.
shuffle_split_cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=42)

In [None]:
# Logistic regression on standardised features: cross-validate on the training
# split, then refit on the full training split and score on the hold-out split.
log_reg = LogisticRegression(max_iter=1000, random_state=42)

# Use the configured estimator in the pipeline. Previously a default
# LogisticRegression() was placed here and `log_reg` was never used, so the
# max_iter / random_state settings above had no effect.
log_reg_pipeline = Pipeline([
    ("feature_scaling", StandardScaler()),
    ("logistic_regression", log_reg),
])

log_reg_cv_results = cross_validate(
    log_reg_pipeline, train_features, train_labels, cv=shuffle_split_cv, scoring="accuracy"
)
log_reg_scores = pd.Series(log_reg_cv_results['test_score'], name="log_reg_accuracy")

log_reg_pipeline.fit(train_features, train_labels)
labels_pred = log_reg_pipeline.predict(test_features)
train_accuracy = accuracy_score(train_labels, log_reg_pipeline.predict(train_features))
test_accuracy = accuracy_score(test_labels, labels_pred)

# Hold-out metrics; 'M' (malignant) is treated as the positive class.
find_accu = test_accuracy  # same quantity as test_accuracy, no need to recompute it
find_prec = precision_score(test_labels, labels_pred, pos_label='M')
find_recall = recall_score(test_labels, labels_pred, pos_label='M')
find_f1 = f1_score(test_labels, labels_pred, pos_label='M')
print("accuracy", find_accu)
print("precision", find_prec)
print("recall", find_recall)
print("f1", find_f1)
print("")

print("trained accuracy", train_accuracy)
print("test accuracy", test_accuracy)
print("")

# Rough heuristic: a train/test gap above 5 points is flagged as overfitting,
# both scores below 0.7 as underfitting.
if train_accuracy > test_accuracy + 0.05:
  print("Overfitting")
elif train_accuracy < 0.7 and test_accuracy < 0.7:
  print("underfitting")
else:
  print("model is good")

In [None]:
# Linear regression on the integer-encoded diagnosis.
#
# The targets must be encoded from `labels`, the series that was shuffled
# together with `features`. The previous code split `features` against
# `y_encoded`, which was built from the unshuffled frame, so every row was
# paired with another row's diagnosis and the reported R2 was meaningless.
# LabelEncoder assigns codes in sorted class order.
labels_encoded = LabelEncoder().fit_transform(labels)

lin_reg = LinearRegression()
train_features, test_features, train_labels_encoded, test_labels_encoded = train_test_split(
    features, labels_encoded, test_size=0.2, random_state=42
)

# Put the estimator created above in the pipeline instead of a second, separate instance.
lin_reg_pipeline = Pipeline([("linear_regression", lin_reg)])
lin_reg_cv_results = cross_validate(
    lin_reg_pipeline, train_features, train_labels_encoded, cv=shuffle_split_cv, scoring="r2"
)
lin_reg_scores = pd.Series(lin_reg_cv_results['test_score'], name="lin_reg_r2")

lin_reg_pipeline.fit(train_features, train_labels_encoded)
labels_pred_encoded = lin_reg_pipeline.predict(test_features)
train_r2 = r2_score(train_labels_encoded, lin_reg_pipeline.predict(train_features))
test_r2 = r2_score(test_labels_encoded, labels_pred_encoded)
print("train R2:", train_r2)
print("test R2:", test_r2)
print("")

# Rough heuristic: both scores below 0.7 is reported as underfitting.
if train_r2 < 0.7 and test_r2 < 0.7:
  print("underfitting")
else:
  print("model fit is moderate to good")

In [None]:
# Restrict the shuffled predictors to the three selected columns.
selected_features = ['radius_mean', 'area_mean', 'area_worst']
X = x.loc[:, selected_features]

# Fit the encoder on the shuffled target, then turn the labels into integers.
le = LabelEncoder().fit(y)
y_encoded = le.transform(y)

print("Classes:", le.classes_)
print("Encoded target (first 10):", y_encoded[:10])



In [None]:
# Standardise the predictors (fit and transform on the same data).
scaler = StandardScaler().fit(features)
features_scaled = scaler.transform(features)

# Fit a logistic regression on the scaled predictors.
model = LogisticRegression(max_iter=1000, solver='lbfgs').fit(features_scaled, y)

# Probability of the second class for every row, and the log loss on the same rows.
class_probabilities = model.predict_proba(features_scaled)
probs = class_probabilities[:, 1]
loss = log_loss(y, probs)

# Learned parameters of the fitted model.
final_weights, final_bias = model.coef_[0], model.intercept_[0]

print("Forward Pass (first 5 probs):", probs[:5])
print("Backward Pass (final loss):", loss)
print("Final Weights shape:", final_weights.shape)
print("Final Bias:", final_bias)



In [None]:
# Log loss and R2 of the linear-regression predictions on the hold-out split.
#
# log_loss is defined for probabilities, but a linear regression is unbounded
# and can predict values below 0 or above 1. Clip the predictions into [0, 1]
# first so they are valid input regardless of how log_loss validates its range.
labels_pred_clipped = np.clip(labels_pred_encoded, 0.0, 1.0)
loss = log_loss(test_labels, labels_pred_clipped)
print("loss", loss)

# R2 is computed on the raw (unclipped) regression output.
r2 = r2_score(test_labels_encoded, labels_pred_encoded)
print("r2", r2)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error

# --- Simple Linear Regression ---
# A simple linear regression has exactly one explanatory variable. The previous
# version fitted on every column of train_features, which made this cell the
# same model as the multi linear regression cell. Use only the first column.
train_simple = train_features.iloc[:, [0]]
test_simple = test_features.iloc[:, [0]]

lin_reg = LinearRegression()

# Fit the model
lin_reg.fit(train_simple, train_labels_encoded)

# Predict on train and test data
train_pred = lin_reg.predict(train_simple)
test_pred = lin_reg.predict(test_simple)

# Evaluate performance
train_r2 = r2_score(train_labels_encoded, train_pred)
test_r2 = r2_score(test_labels_encoded, test_pred)

print("Simple Linear Regression Results")
print("Train R²:", train_r2)
print("Test R²:", test_r2)
print("Train MSE:", mean_squared_error(train_labels_encoded, train_pred))
print("Test MSE:", mean_squared_error(test_labels_encoded, test_pred))

# Rough heuristic: both scores below 0.7 -> underfitting; a train/test gap
# above 0.2 -> possible overfitting.
if train_r2 < 0.7 and test_r2 < 0.7:
    print("Model is Underfitting")
elif abs(train_r2 - test_r2) > 0.2:
    print("Model may be Overfitting")
else:
    print("Model fit is good")

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

# Shuffle and split
features, labels = shuffle(features, labels, random_state=42)

# Encode the targets from `labels`, which was just shuffled together with
# `features`. The previous code split against `y_encoded`, which did not go
# through this shuffle, so each row was paired with another row's diagnosis.
# LabelEncoder assigns codes in sorted class order.
labels_encoded = LabelEncoder().fit_transform(labels)
train_features, test_features, train_labels_encoded, test_labels_encoded = train_test_split(
    features, labels_encoded, test_size=0.2, random_state=42
)

# --- Multi Linear Regression ---
multi_lin_reg = LinearRegression()
multi_lin_reg.fit(train_features, train_labels_encoded)

# Predictions
train_pred = multi_lin_reg.predict(train_features)
test_pred = multi_lin_reg.predict(test_features)

# Evaluation
train_r2 = r2_score(train_labels_encoded, train_pred)
test_r2 = r2_score(test_labels_encoded, test_pred)

print("Multi Linear Regression Results")
print("Train R²:", train_r2)
print("Test R²:", test_r2)
print("Train MSE:", mean_squared_error(train_labels_encoded, train_pred))
print("Test MSE:", mean_squared_error(test_labels_encoded, test_pred))

# Rough heuristic: both scores below 0.7 -> underfitting; a train/test gap
# above 0.2 -> possible overfitting.
if train_r2 < 0.7 and test_r2 < 0.7:
    print("Model is Underfitting")
elif abs(train_r2 - test_r2) > 0.2:
    print("Model may be Overfitting")
else:
    print("Model fit is good")

In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error

# --- Polynomial Linear Regression ---
# Second-degree expansion without the constant column (the degree is tunable).
poly = PolynomialFeatures(degree=2, include_bias=False)

# Learn the expansion on the training rows only, then apply it to both splits.
poly.fit(train_features)
train_features_poly = poly.transform(train_features)
test_features_poly = poly.transform(test_features)

# Ordinary least squares on the expanded features.
poly_reg = LinearRegression().fit(train_features_poly, train_labels_encoded)

# Predictions for both splits.
train_pred_poly = poly_reg.predict(train_features_poly)
test_pred_poly = poly_reg.predict(test_features_poly)

# Goodness of fit on both splits.
train_r2_poly = r2_score(train_labels_encoded, train_pred_poly)
test_r2_poly = r2_score(test_labels_encoded, test_pred_poly)
train_mse_poly = mean_squared_error(train_labels_encoded, train_pred_poly)
test_mse_poly = mean_squared_error(test_labels_encoded, test_pred_poly)

print("Polynomial Linear Regression Results (degree=2)")
print("Train R²:", train_r2_poly)
print("Test R²:", test_r2_poly)
print("Train MSE:", train_mse_poly)
print("Test MSE:", test_mse_poly)

# Same heuristic as the other regression cells: low scores on both splits mean
# underfitting, a large train/test gap means possible overfitting.
both_scores_low = train_r2_poly < 0.7 and test_r2_poly < 0.7
large_gap = abs(train_r2_poly - test_r2_poly) > 0.2
if both_scores_low:
    print("Model is Underfitting")
elif large_gap:
    print("Model may be Overfitting")
else:
    print("Model fit is good")

In [None]:
# R2 of a linear regression fitted on the first d columns, for d = 1..5.

# Always build `data` from the raw frame. With the previous `if 'id' in ...`
# guard, `data` was left undefined (or stale from an earlier run) whenever the
# 'id' column was absent; errors='ignore' covers that case.
data = dataset.drop(columns=['id'], errors='ignore')

# Encode target: Malignant=1, Benign=0
data['diagnosis'] = data['diagnosis'].map({'M': 1, 'B': 0})

# Features and target
X = data.drop(columns=['diagnosis'])
y = data['diagnosis']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Try with different numbers of features (d)
for d in range(1, 6):  # d = 1 to 5
    # First d columns of each split. Named train_d / test_d so the loop does not
    # rebind `selected_features`, which an earlier cell defines as a list of column names.
    train_d = X_train.iloc[:, :d]
    test_d = X_test.iloc[:, :d]
    model = LinearRegression()
    model.fit(train_d, y_train)
    y_pred = model.predict(test_d)
    r2 = r2_score(y_test, y_pred)
    print(f"d={d}, R2 Score={r2:.4f}")

In [None]:
# R2, covariance eigenvalues and condition index for the first d columns, d = 1..5.

# Rebuild `data` from the raw frame so this cell is idempotent. Mapping
# {'M': 1, 'B': 0} onto a 'diagnosis' column that an earlier cell already
# mapped to 1/0 turns every label into NaN.
data = dataset.drop(columns=['id'], errors='ignore')

# Encode target: Malignant=1, Benign=0
data['diagnosis'] = data['diagnosis'].map({'M': 1, 'B': 0})

# Features and labels
features = data.drop(columns=['diagnosis'])
labels = data['diagnosis']

# Shuffle first, then encode. Encoding before the shuffle (as before) leaves
# `y_encoded` in the original row order while `features` is permuted, so the
# split would pair each row with another row's label.
features, labels = shuffle(features, labels, random_state=42)

le = LabelEncoder()
y_encoded = le.fit_transform(labels)

# Train-test split
train_features, test_features, train_labels_encoded, test_labels_encoded = train_test_split(
    features, y_encoded, test_size=0.2, random_state=42
)

# Loop for d = 1 to 5
for d in range(1, 6):
    # Select first d features
    train_d = train_features.iloc[:, :d]
    test_d = test_features.iloc[:, :d]

    # Train linear regression
    model = LinearRegression()
    model.fit(train_d, train_labels_encoded)
    y_pred = model.predict(test_d)

    # R² score
    r2 = r2_score(test_labels_encoded, y_pred)

    # Eigenvalues & condition index.
    # np.cov collapses to a 0-d array when there is a single column (d=1), which
    # the eigen-solvers reject, so force a 2-D matrix. A covariance matrix is
    # symmetric, so eigvalsh applies and returns real eigenvalues directly
    # (in ascending order).
    cov_matrix = np.atleast_2d(np.cov(train_d, rowvar=False))
    eigenvalues = np.linalg.eigvalsh(cov_matrix)

    max_eig = np.max(eigenvalues)
    positive_eigenvalues = eigenvalues[eigenvalues > 0]
    # Fall back to a tiny value to avoid dividing by zero when no eigenvalue is positive.
    min_eig = np.min(positive_eigenvalues) if positive_eigenvalues.size else 1e-10
    condition_index = np.sqrt(max_eig / min_eig)

    print(f"d={d}, R²={r2:.4f}, λ={eigenvalues.round(2)}, η={condition_index:.2f}")