In [None]:
#Regression with Evaluation metrics

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.preprocessing import StandardScaler

# Column names for the housing file, which is read without a header row
# (header=None) and split on runs of whitespace.
column_names = [
    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
    'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV',
]
# NOTE(review): absolute, machine-specific path -- the cell only runs where this file exists.
data = pd.read_csv(
    'D:\\AppStoneLab\\Day 2\\housing.csv',
    sep=r"\s+",
    header=None,
    names=column_names,
)

# MEDV is the regression target; every remaining column is used as a predictor.
y = data['MEDV']
X = data.drop(columns='MEDV')

# Hold out 25% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=100
)

# Plain linear regression on the unscaled features.
LR = LinearRegression().fit(X_train, y_train)
y_pred = LR.predict(X_test)

# Score the held-out predictions with three regression metrics.
MAE = mean_absolute_error(y_test, y_pred)
MSE = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Before normalization : \n")
print("MAE: ", MAE)
print("MSE: ", MSE)
print("r2 score: ",r2)

In [None]:
#Mini Project: House Price Predictor
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.linear_model import Ridge, Lasso

# Column names for the header-less, whitespace-delimited housing file.
column_names = [
    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
    'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV',
]
data = pd.read_csv(
    'housing.csv',
    header=None,
    delimiter=r"\s+",
    names=column_names,
)

# Target is MEDV; all other columns are predictors.
y = data['MEDV']
X = data.drop(columns='MEDV')

# 75/25 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=100
)


def _fit_and_score(estimator):
    """Fit `estimator` on the training split.

    Returns a tuple of (fitted estimator, predictions on X_test, R2 on the test split).
    """
    estimator.fit(X_train, y_train)
    predictions = estimator.predict(X_test)
    return estimator, predictions, r2_score(y_test, predictions)


# Baseline: unregularised linear regression.
model, y_pred, r2 = _fit_and_score(LinearRegression())
print("R2 score for Linear Regression: ", r2)

# L2-regularised variant. Author's observation: the R2 score decreases as alpha increases.
ridge_model, y_ridge_pred, r2_ridge = _fit_and_score(Ridge(alpha=1))
print("R2 score for Ridge Regression: ", r2_ridge)

# L1-regularised variant. Author's observation: the R2 score decreases as alpha increases.
lasso_model, y_lasso_pred, r2_lasso = _fit_and_score(Lasso(alpha=1))
print("R2 score for Lasso Regression: ", r2_lasso)

In [None]:
#Polynomial regression 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

# Compare a straight-line fit with a degree-2 polynomial fit on a two-column dataset
# (first column = predictor, second column = response).
df = pd.read_csv("poly_ds.csv")
X = df.iloc[:, 0].values.reshape(-1, 1)  # scikit-learn expects a 2-D feature matrix
y = df.iloc[:, 1]

# Baseline: ordinary linear regression on the raw predictor.
model = LinearRegression()
model.fit(X, y)

# Expand the predictor into [1, x, x^2] and fit a linear model on the expanded features.
# The transformer is fitted exactly once, on the raw X. Re-fitting it on X_poly (as the
# previous version did) would leave `poly` expecting three input columns and make
# poly.transform(X) fail.
poly = PolynomialFeatures(degree=2)
X_poly = poly.fit_transform(X)

lin2 = LinearRegression()
lin2.fit(X_poly, y)

# Draw the fitted curves over x values in ascending order; plotting in raw row order
# produces a zig-zag line whenever the rows are not already sorted by x.
order = X[:, 0].argsort()
X_sorted = X[order]

plt.scatter(X, y, color='blue')
plt.plot(X_sorted, model.predict(X_sorted), color='red')
plt.title('Linear Regression')
plt.xlabel('Temperature')
plt.ylabel('Pressure')
plt.show()

plt.scatter(X, y, color='blue')
plt.plot(X_sorted, lin2.predict(poly.transform(X_sorted)), color='red')
plt.title('Polynomial Regression')
plt.xlabel('Temperature')
plt.ylabel('Pressure')
plt.show()


In [None]:
#Logistic Regression

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
# Binary classification of the 'Survived' column with logistic regression.
# NOTE(review): absolute, machine-specific path -- the cell only runs where this file exists.
df = pd.read_csv('D:\\AppStoneLab\\Day 1\\Titanic-Dataset.csv')

# Remove the columns that are never used as features BEFORE dropping incomplete rows.
# Otherwise a missing value in an unused column (e.g. Cabin) would throw away a row
# whose actual features are all present.
df = df.drop(columns=['Name', 'Ticket', 'Cabin', 'PassengerId'])
df = df.dropna()

# One-hot encode the two categorical features as 0/1 integer columns.
df = pd.get_dummies(data=df, columns=['Sex', 'Embarked'], dtype='int')

X = df.drop(columns='Survived')
y = df['Survived']

# 75/25 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=100)

model = LogisticRegression(solver='saga', random_state=100, max_iter=7000)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# Probability of the positive class: ROC-AUC is a ranking metric and should be
# computed from continuous scores, not from thresholded 0/1 predictions.
y_score = model.predict_proba(X_test)[:, 1]

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_score)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)

print("Accuracy: ", accuracy)
print("Precision: ", precision)
print("Recall: ", recall)
print("F1 score: ", f1)
print("ROC_AUC score: ", roc_auc)

# fmt='d' prints the counts as integers instead of the default 2-significant-digit format.
sns.heatmap(cm, annot=True, fmt='d')
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()


In [None]:
#KNN

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score, roc_auc_score, confusion_matrix
from sklearn.preprocessing import StandardScaler

# k-nearest-neighbours classification of 'diagnosis' (B -> 0, M -> 1).
df = pd.read_csv('KNNAlgorithmDataset.csv')
df = df.drop(columns='Unnamed: 32')

# NOTE(review): assumes the file carries an 'id' record-identifier column, as the Kaggle
# Breast Cancer Wisconsin CSV does -- confirm. An identifier has no predictive meaning and
# only adds noise to distance computations; errors='ignore' keeps this a no-op if absent.
X = df.drop(columns=['diagnosis', 'id'], errors='ignore')
y = df['diagnosis'].map({'B': 0, 'M': 1})

# 75/25 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=100)

# Standardise using statistics learned from the training split only.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
# Positive-class probability: ROC-AUC should be computed from scores, not hard labels.
y_score = knn.predict_proba(X_test)[:, 1]

f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_score)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)

print("F1 score: ",f1)
print("ROC_AUC score: ",roc_auc)
# fmt='d' prints the counts as integers instead of the default 2-significant-digit format.
sns.heatmap(cm, annot=True, fmt='d')
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()

In [None]:
#SVM

from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Support-vector classification of 'diagnosis' (B -> 0, M -> 1).
df = pd.read_csv('KNNAlgorithmDataset.csv')
df = df.drop(columns='Unnamed: 32')

# NOTE(review): assumes the file carries an 'id' record-identifier column, as the Kaggle
# Breast Cancer Wisconsin CSV does -- confirm. An identifier has no predictive meaning;
# errors='ignore' keeps this a no-op if the column is absent.
X = df.drop(columns=['diagnosis', 'id'], errors='ignore')
y = df['diagnosis'].map({'B': 0, 'M': 1})

# 75/25 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=100)

# Standardise using statistics learned from the training split only.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

svc = SVC()  # parameters can be changed (using defaults for now)
svc.fit(X_train, y_train)
y_pred = svc.predict(X_test)
# SVC() is created without probability estimates, so the signed distance to the
# decision boundary is used as the continuous score for ROC-AUC.
y_score = svc.decision_function(X_test)

f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_score)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)

print("F1 score: ",f1)
print("ROC_AUC score: ",roc_auc)
# fmt='d' prints the counts as integers instead of the default 2-significant-digit format.
sns.heatmap(cm, annot=True, fmt='d')
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()

In [None]:
#Gaussian naive bayes
from sklearn.naive_bayes import GaussianNB

# Three-class Gaussian naive Bayes on the Iris file.
# NOTE(review): absolute, machine-specific path -- the cell only runs where this file exists.
df = pd.read_csv('D:\\AppStoneLab\\Day 2\\Iris.csv')

# Encode the species names as integer class ids; 'Id' and 'Species' are not features.
species_to_id = {'Iris-setosa':0, 'Iris-versicolor':1, 'Iris-virginica':2}
y = df['Species'].map(species_to_id)
X = df.drop(columns=['Id', 'Species'])

# 75/25 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=100
)

gnb = GaussianNB().fit(X_train, y_train)

# Hard labels feed F1 and the confusion matrix; class probabilities feed ROC-AUC.
y_pred_labels = gnb.predict(X_test)
y_pred_prob = gnb.predict_proba(X_test)

# average=None returns one F1 value per class rather than a single aggregate.
f1 = f1_score(y_test, y_pred_labels, average=None)
print("F1 score per class:", f1)

# Multiclass ROC-AUC: one-vs-rest, macro-averaged over the classes.
roc_auc = roc_auc_score(y_test, y_pred_prob, multi_class='ovr', average='macro')
print("ROC-AUC score:", roc_auc)

cm = confusion_matrix(y_test, y_pred_labels)
sns.heatmap(cm, annot=True)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()


In [None]:
#Multinomial Naive Bayes

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score, roc_auc_score, confusion_matrix

# Spam / ham text classification with a bag-of-words multinomial naive Bayes model.
df = pd.read_csv('spam.csv', encoding='iso8859_14')
df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'])

# 'v1' holds the label (ham -> 0, spam -> 1); 'v2' holds the raw message text.
y = df['v1'].map({'ham':0, 'spam':1})

# Split the RAW text first (75/25, fixed seed) so that the vocabulary is learned from the
# training messages only. Fitting the vectorizer on the whole corpus before splitting
# lets information from the test set leak into the features.
text_train, text_test, y_train, y_test = train_test_split(
    df['v2'], y, test_size=0.25, random_state=100
)

vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)  # reuses the training vocabulary

mnb = MultinomialNB()
mnb.fit(X_train, y_train)
y_pred = mnb.predict(X_test)
# Spam probability: ROC-AUC should be computed from scores, not hard 0/1 labels.
y_score = mnb.predict_proba(X_test)[:, 1]

f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_score)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)

print("F1 score: ",f1)
print("ROC_AUC score: ",roc_auc)
# fmt='d' prints the counts as integers instead of the default 2-significant-digit format.
sns.heatmap(cm, annot=True, fmt='d')
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()

In [None]:
#Decision Tree

from sklearn.tree import DecisionTreeClassifier, plot_tree

# Three-class decision-tree classification on the Iris file, plus a drawing of the tree.
# NOTE(review): absolute, machine-specific path -- the cell only runs where this file exists.
df = pd.read_csv('D:\\AppStoneLab\\Day 2\\Iris.csv')

# 'Id' and 'Species' are not features; species names are encoded as integer class ids.
X = df.drop(columns=['Id', 'Species'])
y = df['Species'].map({'Iris-setosa':0, 'Iris-versicolor':1, 'Iris-virginica':2})

# 75/25 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=100)

# random_state makes the fitted tree repeatable across runs, matching the fixed seed
# used everywhere else in this notebook.
DT = DecisionTreeClassifier(criterion='entropy', random_state=100)
DT.fit(X_train, y_train)
y_pred = DT.predict(X_test)

# average=None returns one F1 value per class.
f1 = f1_score(y_test, y_pred, average=None)
# Multiclass ROC-AUC needs per-class probabilities (n_samples x n_classes), not hard labels.
roc_auc = roc_auc_score(y_test, DT.predict_proba(X_test), multi_class='ovr', average='macro')
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)

print("F1 score: ",f1)
print("ROC_AUC score: ",roc_auc)
# fmt='d' prints the counts as integers instead of the default 2-significant-digit format.
sns.heatmap(cm, annot=True, fmt='d')
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()

plt.figure(figsize=(12,8))
# feature_names is passed as a plain list: some scikit-learn versions validate this
# argument as a list and reject a pandas Index.
plot_tree(DT, feature_names=list(X.columns), class_names=['Setosa','Versicolor','Virginica'], filled=True, rounded=True)
plt.show()

In [None]:
#Random Forest 

from sklearn.ensemble import RandomForestClassifier

# Three-class random-forest classification on the Iris file, with feature importances.
# NOTE(review): absolute, machine-specific path -- the cell only runs where this file exists.
df = pd.read_csv('D:\\AppStoneLab\\Day 2\\Iris.csv')

# Encode the species names as integer class ids; 'Id' and 'Species' are not features.
species_to_id = {'Iris-setosa':0, 'Iris-versicolor':1, 'Iris-virginica':2}
y = df['Species'].map(species_to_id)
X = df.drop(columns=['Id', 'Species'])

# 75/25 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=100
)

# 100 trees; the seed fixes the forest's randomness.
rf = RandomForestClassifier(n_estimators=100, random_state=100).fit(X_train, y_train)
y_pred = rf.predict(X_test)

# Per-class F1 (average=None) and the confusion matrix on the held-out rows.
f1 = f1_score(y_test, y_pred, average=None)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)

print("F1 score: ",f1)
sns.heatmap(cm, annot=True)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# Importances labelled by feature name, largest first.
importances_by_feature = pd.Series(rf.feature_importances_, index=X.columns)
feature_importance = importances_by_feature.sort_values(ascending=False)
print("Feature Importance:\n", feature_importance)

# Bar chart of the same ranking.
feature_importance.plot.bar()
plt.title('Feature Importance from Random Forest')
plt.ylabel('Importance')
plt.show()