In [58]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
#generates a synthetic dataset for binary or multiclass classification.

#Generate a synthetic binary classification dataset
X,y = make_classification(n_samples=200,n_features = 2,n_informative=2,n_redundant=0,random_state=0)

In [59]:
#Split the data into training and testing sets
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=42) #Default is 75% training and 25% testing

In [60]:
#Initialize logistic regression with regularization strength C=1.0 (default)
clf = LogisticRegression(C=1.0).fit(X_train,y_train)

In [61]:
#Predict probabilities for the positive class (class 1)
probs = clf.predict_proba(X_test)[:,1] #This step extracts only the probability for class 1
#Each value between 0 and 1
#Apply the default threshold 0.5 to get predicted labels
labels_default = (probs>=0.5).astype(int)

In [62]:
#Apply a custom threshold 0.7 to get predicted labels
labels_custom = (probs>=0.7).astype(int)

In [63]:
#Print accuracy for both threshold settings
print("Accuracy (t=0.5):",(labels_default==y_test).mean())
print("Accuracy (t=0.7):",(labels_custom==y_test).mean())

Accuracy (t=0.5): 0.98
Accuracy (t=0.7): 0.98


In [64]:
#Metrics
from sklearn.metrics import confusion_matrix,accuracy_score,recall_score,precision_score,f1_score,roc_auc_score

In [65]:
#Generate synthetic dataset -> 5 features, 3 of them infor,500 samples
X,y = make_classification(n_samples=500,n_features=5,n_informative=3,n_redundant=0,random_state=0)

In [66]:
#train test split
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=42)

In [67]:
#Do model/fitting
model=LogisticRegression().fit(X_train,y_train)

In [68]:
#give me predict() for test values
y_pred = model.predict(X_test)

In [69]:
#y_pred,y_test
#probs = predict probs for class 1 -> ROC-AUC value
y_probs = model.predict_proba(X_test)[:,1]

In [70]:
#CM -> TN,TP 0->0 1->1
#FN,FP ->1->0 FN,0 -> 1,FP
cm = confusion_matrix(y_test,y_pred)
print(f"Confusion Matrix:\n{cm}")

Confusion Matrix:
[[61 10]
 [11 43]]


In [71]:
#accuracy = TP+TN /(Total Prediction) #correct predictions
accuracy = accuracy_score(y_test,y_pred)
print(f"Accuracy:{accuracy:.3f}")

Accuracy:0.832


In [72]:
#Precision = TP/TP+FP #correct positive predictions
precision = precision_score(y_test,y_pred)
print(f"Precision:{precision:.3f}")

Precision:0.811


In [73]:
#recall =TP/TP+FN # how many positives were correctly identified
recall=recall_score(y_test,y_pred)
print(f"Recall:{recall:.3f}")

Recall:0.796


In [74]:
# 2*((Precision*Recall)/(Precision+Recall))
f1 = f1_score(y_test,y_pred)
print(f"F1-Score:{f1:.3f}")

F1-Score:0.804


In [75]:
#Roc-auc value -> separated +ve and -ve classes
#1.0 -> perfect separation
#0.5 -> random guesses
#<0.5 -> incorrect guesses
roc_auc = roc_auc_score(y_test,y_pred)
print(f"ROC-AUC score: {roc_auc:.3f}")

ROC-AUC score: 0.828


In [76]:
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from itertools import product

In [77]:
# ---Step 1: Create synthetic binary classification data --
X,y = make_classification(n_samples=200,n_features=5,random_state=42)

In [78]:
#--- Step 2: Train logistic Regression ---
model = LogisticRegression()
model.fit(X,y)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [79]:
# ---Step 3: Predict probabilities for class 1 ---
probs = model.predict_proba(X)[:,1]

In [80]:
# Step 4: Separate event and non-event scores
event_scores = probs[y==1]
non_event_scores = probs[y==0]

concordant=discordant=tied=0

In [81]:
#Step 5: Compare all event/non-event pairs
for e,ne in product(event_scores,non_event_scores):
    if e>ne:
        concordant+=1
    elif e<ne:
        discordant+=1
    else:
        tied+=1
total_pairs = concordant+discordant+tied

In [82]:
#Step 6: compute metrics
conc_ratio_simple = concordant/total_pairs
disc_ratio = discordant/total_pairs
conc_ratio_tie= (concordant+0.5*tied)/total_pairs
cd_ratio = conc_ratio_simple/disc_ratio if disc_ratio>0 else np.inf

In [83]:
#Step 7: Compute ROC-AUC
roc_auc = roc_auc_score(y,probs)

In [84]:
#Step 8: Display results:
print(f"%Concordant: {100*conc_ratio_simple:.4f}")
print(f"%Discordant:{100*disc_ratio:.4f}")
print(f"%Tied: {100*(tied/total_pairs):.4f}")
print(f"Concordance Ratio (Simple):{conc_ratio_simple:.4f}")
print(f"Concordance Ratio (Tie-Adjusted): {conc_ratio_tie:.4f}")
print(f"Discordance Ratio:{disc_ratio:.4f}")
print(f"C/D Ratio:{cd_ratio:.4f}")

%Concordant: 92.3992
%Discordant:7.6008
%Tied: 0.0000
Concordance Ratio (Simple):0.9240
Concordance Ratio (Tie-Adjusted): 0.9240
Discordance Ratio:0.0760
C/D Ratio:12.1566


In [85]:
from sklearn.datasets import load_iris
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,classification_report

In [86]:
#Load iris data
iris = load_iris()
X,y = iris.data,iris.target

In [87]:
#Split data into train and test sets
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=42,test_size=0.3)

In [88]:
#Initialize and train Gaussian Naive Bayes model
model = GaussianNB()
model.fit(X_train,y_train)

0,1,2
,priors,
,var_smoothing,1e-09


In [89]:
#Predict on test set
y_pred = model.predict(X_test)

In [90]:
#Evaluate accuracy
accuracy = accuracy_score(y_test,y_pred)
print(f"Accuracy:{accuracy:.3f}")

Accuracy:0.978


In [91]:
#Detailed classification report
print(classification_report(y_test,y_pred,target_names=iris.target_names))

              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        19
  versicolor       1.00      0.92      0.96        13
   virginica       0.93      1.00      0.96        13

    accuracy                           0.98        45
   macro avg       0.98      0.97      0.97        45
weighted avg       0.98      0.98      0.98        45



In [92]:
#New Sample
sample = np.array([[6.0,3.0,4.8,1.8]])

In [93]:
#Predict class probabilities for the sample (posterior probabilities)
posterior_probs = model.predict_proba(sample)
print(f"Posterior probabilities for the sample {sample.flatten()}:")
for class_name,prob in zip(iris.target_names,posterior_probs[0]):
    print(f"{class_name}:{prob:.4f}")

Posterior probabilities for the sample [6.  3.  4.8 1.8]:
setosa:0.0000
versicolor:0.1321
virginica:0.8679


In [94]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,classification_report

In [99]:
#Load a subset of the 20 newsgroups dataset
categories = ['rec.sport.baseball', 'sci.med', 'comp.graphics']
newsgroups = fetch_20newsgroups(subset='all', categories=categories)


URLError: <urlopen error [Errno 11001] getaddrinfo failed>