In [1]:
# Fetch the churn CSV from Google Drive by file id using the gdown CLI
# (the cell output shows it saved as churn_logistic.csv, which the next cell reads).
!gdown 1uUt7uL-VuF_5cpodYRiriEwhsldeEp3m

Downloading...
From: https://drive.google.com/uc?id=1uUt7uL-VuF_5cpodYRiriEwhsldeEp3m
To: /content/churn_logistic.csv
  0% 0.00/494k [00:00<?, ?B/s]100% 494k/494k [00:00<00:00, 39.0MB/s]


In [2]:
import pandas as pd

# Lift the column cap so wide frames are displayed without truncation.
pd.options.display.max_columns = None

# Read the downloaded churn data and preview its first rows.
churn = pd.read_csv("churn_logistic.csv")
churn.head()

Unnamed: 0,Account Length,VMail Message,Day Mins,Eve Mins,Night Mins,Intl Mins,CustServ Calls,Intl Plan,VMail Plan,Day Calls,Day Charge,Eve Calls,Eve Charge,Night Calls,Night Charge,Intl Calls,Intl Charge,State,Area Code,Phone,Churn
0,128,25,265.1,197.4,244.7,10.0,1,0,1,110,45.07,99,16.78,91,11.01,3,2.7,KS,415,382-4657,0
1,107,26,161.6,195.5,254.4,13.7,1,0,1,123,27.47,103,16.62,103,11.45,3,3.7,OH,415,371-7191,0
2,137,0,243.4,121.2,162.6,12.2,0,0,0,114,41.38,110,10.3,104,7.32,5,3.29,NJ,415,358-1921,0
3,84,0,299.4,61.9,196.9,6.6,2,1,0,71,50.9,88,5.26,89,8.86,7,1.78,OH,408,375-9999,0
4,75,0,166.7,148.3,186.9,10.1,3,1,0,113,28.34,122,12.61,121,8.41,3,2.73,OK,415,330-6626,0


In [3]:
# Feature columns fed to the churn models; the label is the "Churn" column.
cols = [
    'Day Mins',
    'Eve Mins',
    'Night Mins',
    'CustServ Calls',
    'Account Length',
]
X, y = churn[cols], churn["Churn"]

# Sanity check: (rows, number of selected features).
X.shape

(5700, 5)

In [4]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)

# Shapes of the four pieces, in the order train/test features then train/test labels.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((4560, 5), (1140, 5), (4560,), (1140,))

In [12]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, recall_score, precision_score, f1_score

In [7]:
# Standardise every selected feature column before it reaches the classifier.
preprocessor = ColumnTransformer(transformers=[('num', StandardScaler(), cols)])

# Baseline pipeline: scaling followed by a default-configured logistic regression.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression()),
])

In [8]:
# Fit the scaler and the classifier on the training split.
model.fit(X_train, y_train)

In [14]:
# Learned coefficients of the pipeline's 'classifier' step.
model.named_steps['classifier'].coef_

array([[0.72176541, 0.2786646 , 0.10183591, 0.7869764 , 0.03877948]])

In [15]:
# Learned intercept of the pipeline's 'classifier' step.
model.named_steps['classifier'].intercept_

array([0.01124919])

In [16]:
# Per-class probability estimates for each test row.
model.predict_proba(X_test)

array([[0.23977086, 0.76022914],
       [0.50641509, 0.49358491],
       [0.78825707, 0.21174293],
       ...,
       [0.53327489, 0.46672511],
       [0.22085291, 0.77914709],
       [0.74399061, 0.25600939]])

In [10]:
# Hard class predictions of the baseline model for the test split.
y_hat = model.predict(X_test)

In [17]:
# Display the predicted labels.
y_hat

array([1, 0, 0, ..., 0, 1, 0])

In [13]:
# Accuracy of the baseline predictions on the held-out test split.
accuracy_score(y_test, y_hat)

0.7114035087719298

In [18]:
# Accuracy on the training split, for comparison with the test score.
accuracy_score(y_train, model.predict(X_train))

0.712280701754386

In [26]:
# Same preprocessing as the baseline, but the classifier uses an L1 penalty
# fitted with the liblinear solver.
model_lasso = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(penalty='l1', solver='liblinear')),
])

In [27]:
# Fit the L1-penalised pipeline on the training split.
model_lasso.fit(X_train, y_train)

In [28]:
# Coefficients learned under the L1 penalty.
model_lasso.named_steps['classifier'].coef_

array([[0.72133769, 0.27781887, 0.10081401, 0.7863674 , 0.0376966 ]])

In [29]:
# Intercept learned under the L1 penalty.
model_lasso.named_steps['classifier'].intercept_

array([0.01019989])

In [30]:
# Per-class probability estimates of the L1 model for each test row.
model_lasso.predict_proba(X_test)

array([[0.2401027 , 0.7598973 ],
       [0.50600081, 0.49399919],
       [0.78825576, 0.21174424],
       ...,
       [0.53327255, 0.46672745],
       [0.22149173, 0.77850827],
       [0.74369853, 0.25630147]])

In [31]:
# Rebinds y_hat to the L1 model's test predictions (the baseline predictions are replaced).
y_hat = model_lasso.predict(X_test)

In [32]:
# Test-split accuracy of the predictions currently held in y_hat.
accuracy_score(y_test, y_hat)

0.7096491228070175

In [33]:
# Training-split accuracy of the L1 model.
accuracy_score(y_train, model_lasso.predict(X_train))

0.7109649122807018

In [34]:
from sklearn.linear_model import LogisticRegressionCV

In [50]:
# L2-penalised logistic regression whose regularisation strength is selected
# by 5-fold cross-validation, behind the same preprocessing step.
model_ridgeCV = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegressionCV(penalty='l2', solver='liblinear', cv=5)),
])

In [51]:
# Fit the cross-validated L2 pipeline on the training split.
model_ridgeCV.fit(X_train, y_train)

In [52]:
# Coefficients of the cross-validated L2 classifier.
model_ridgeCV.named_steps['classifier'].coef_

array([[0.56623502, 0.22780008, 0.08219862, 0.60891194, 0.03003867]])

In [53]:
# Intercept of the cross-validated L2 classifier.
model_ridgeCV.named_steps['classifier'].intercept_

array([0.0068881])

In [55]:
# Value of C (inverse regularisation strength) selected by cross-validation.
model_ridgeCV.named_steps['classifier'].C_

array([0.00599484])

In [54]:
# Test-split accuracy of the cross-validated L2 model.
accuracy_score(y_test, model_ridgeCV.predict(X_test))

0.712280701754386

In [41]:
# Training-split accuracy of the cross-validated L2 model.
accuracy_score(y_train, model_ridgeCV.predict(X_train))

0.7131578947368421

In [43]:
# L1-penalised logistic regression whose regularisation strength is selected
# by 5-fold cross-validation, behind the same preprocessing step.
model_lassoCV = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegressionCV(penalty='l1', solver='liblinear', cv=5)),
])

In [44]:
# Fit the cross-validated L1 pipeline on the training split.
model_lassoCV.fit(X_train, y_train)

In [45]:
# Coefficients of the cross-validated L1 classifier.
model_lassoCV.named_steps['classifier'].coef_

array([[0.72244576, 0.27863646, 0.10158354, 0.78781352, 0.0384417 ]])

In [46]:
# Intercept of the cross-validated L1 classifier.
model_lassoCV.named_steps['classifier'].intercept_

array([0.01090578])

In [49]:
# Value of C (inverse regularisation strength) selected by cross-validation.
model_lassoCV.named_steps['classifier'].C_

array([2.7825594])

In [47]:
# Test-split accuracy of the cross-validated L1 model.
accuracy_score(y_test, model_lassoCV.predict(X_test))

0.7096491228070175

In [48]:
# Training-split accuracy of the cross-validated L1 model.
accuracy_score(y_train, model_lassoCV.predict(X_train))

0.7109649122807018

In [57]:
# Load the loan-approval data and preview its first rows.
# NOTE(review): the absolute path presumably requires Google Drive to be
# mounted at /content/drive; no mount cell is visible here, so confirm.
df = pd.read_csv(
    filepath_or_buffer="/content/drive/MyDrive/Colab Notebooks/_apna_college/loan_approval_data.csv"
)
df.head(n=5)

Unnamed: 0,Applicant_ID,Applicant_Income,Coapplicant_Income,Employment_Status,Age,Marital_Status,Dependents,Credit_Score,Existing_Loans,DTI_Ratio,Savings,Collateral_Value,Loan_Amount,Loan_Term,Loan_Purpose,Property_Area,Education_Level,Gender,Employer_Category,Loan_Approved
0,1.0,17795.0,1387.0,Salaried,51.0,Married,0.0,637.0,4.0,0.53,19403.0,45638.0,16619.0,84.0,Personal,Urban,Not Graduate,Female,Private,No
1,2.0,2860.0,2679.0,Salaried,46.0,Married,3.0,621.0,2.0,0.3,2580.0,49272.0,38687.0,,Car,Semiurban,Graduate,,Private,No
2,3.0,7390.0,2106.0,Salaried,25.0,Single,2.0,674.0,4.0,0.2,13844.0,6908.0,27943.0,72.0,,Urban,,Female,Government,Yes
3,4.0,13964.0,8173.0,Salaried,40.0,Married,2.0,579.0,3.0,0.31,9553.0,10844.0,27819.0,60.0,Business,Rural,Graduate,Female,Government,No
4,5.0,13284.0,4223.0,Self-employed,31.0,Single,2.0,721.0,1.0,0.29,9386.0,37629.0,12741.0,72.0,Car,,Graduate,Male,Private,Yes


In [58]:
# Column dtypes and non-null counts of the loan data as loaded.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 20 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Applicant_ID        950 non-null    float64
 1   Applicant_Income    950 non-null    float64
 2   Coapplicant_Income  950 non-null    float64
 3   Employment_Status   950 non-null    object 
 4   Age                 950 non-null    float64
 5   Marital_Status      950 non-null    object 
 6   Dependents          950 non-null    float64
 7   Credit_Score        950 non-null    float64
 8   Existing_Loans      950 non-null    float64
 9   DTI_Ratio           950 non-null    float64
 10  Savings             950 non-null    float64
 11  Collateral_Value    950 non-null    float64
 12  Loan_Amount         950 non-null    float64
 13  Loan_Term           950 non-null    float64
 14  Loan_Purpose        950 non-null    object 
 15  Property_Area       950 non-null    object 
 16  Educati

In [59]:
# Remove Applicant_ID before modelling (presumably a row identifier with no
# predictive value; confirm). Rebinding `df` instead of using inplace=True,
# together with errors="ignore", keeps this cell re-runnable: a second run
# no longer raises KeyError once the column is already gone.
df = df.drop(columns=["Applicant_ID"], errors="ignore")

In [64]:
# Missing-value count per column.
df.isna().sum()

Unnamed: 0,0
Applicant_Income,50
Coapplicant_Income,50
Employment_Status,50
Age,50
Marital_Status,50
Dependents,50
Credit_Score,50
Existing_Loans,50
DTI_Ratio,50
Savings,50


In [68]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

np.int64(0)

In [74]:
# Print the category frequencies (missing values included) of every
# object-dtype column, with a blank line after each table.
for col in df.columns:
    if df[col].dtype != "object":
        continue
    print(df[col].value_counts(dropna=False), end="\n\n")

Employment_Status
Salaried         465
Contract         213
Self-employed    182
Unemployed        90
NaN               50
Name: count, dtype: int64

Marital_Status
Married    593
Single     357
NaN         50
Name: count, dtype: int64

Loan_Purpose
Business     202
Car          199
Home         190
Education    181
Personal     178
NaN           50
Name: count, dtype: int64

Property_Area
Urban        467
Rural        294
Semiurban    189
NaN           50
Name: count, dtype: int64

Education_Level
Graduate        672
Not Graduate    278
NaN              50
Name: count, dtype: int64

Gender
Male      571
Female    379
NaN        50
Name: count, dtype: int64

Employer_Category
Private       372
Government    202
MNC           144
Business      135
Unemployed     97
NaN            50
Name: count, dtype: int64

Loan_Approved
No     652
Yes    298
NaN     50
Name: count, dtype: int64



In [75]:
# Rows with a missing Loan_Approved label cannot be used for supervised
# training, so they are removed. `df` is rebound to the filtered result rather
# than mutated with inplace=True, so the frame object loaded earlier is not
# altered behind the back of cells that already displayed it.
df = df.dropna(subset=['Loan_Approved'])
df

Unnamed: 0,Applicant_Income,Coapplicant_Income,Employment_Status,Age,Marital_Status,Dependents,Credit_Score,Existing_Loans,DTI_Ratio,Savings,Collateral_Value,Loan_Amount,Loan_Term,Loan_Purpose,Property_Area,Education_Level,Gender,Employer_Category,Loan_Approved
0,17795.0,1387.0,Salaried,51.0,Married,0.0,637.0,4.0,0.53,19403.0,45638.0,16619.0,84.0,Personal,Urban,Not Graduate,Female,Private,No
1,2860.0,2679.0,Salaried,46.0,Married,3.0,621.0,2.0,0.30,2580.0,49272.0,38687.0,,Car,Semiurban,Graduate,,Private,No
2,7390.0,2106.0,Salaried,25.0,Single,2.0,674.0,4.0,0.20,13844.0,6908.0,27943.0,72.0,,Urban,,Female,Government,Yes
3,13964.0,8173.0,Salaried,40.0,Married,2.0,579.0,3.0,0.31,9553.0,10844.0,27819.0,60.0,Business,Rural,Graduate,Female,Government,No
4,13284.0,4223.0,Self-employed,31.0,Single,2.0,721.0,1.0,0.29,9386.0,37629.0,12741.0,72.0,Car,,Graduate,Male,Private,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,,9092.0,Salaried,58.0,Married,0.0,557.0,0.0,0.59,5370.0,43563.0,8311.0,72.0,Personal,,Not Graduate,Male,Unemployed,No
996,3279.0,6356.0,Self-employed,58.0,Married,1.0,646.0,3.0,0.19,,18361.0,22563.0,12.0,Business,Urban,Graduate,Female,Government,No
997,15192.0,8433.0,Contract,48.0,Single,1.0,666.0,1.0,0.40,8581.0,41335.0,16203.0,24.0,Home,Rural,Graduate,Male,MNC,No
998,9083.0,7380.0,Unemployed,50.0,Single,1.0,748.0,3.0,0.31,13491.0,8933.0,10290.0,36.0,Personal,Urban,Graduate,Male,Private,Yes


In [77]:
# Separate predictors from the target and encode the target as 1 (Yes) / 0 (No).
X = df.drop(columns=["Loan_Approved"])
y = df["Loan_Approved"].map({"Yes": 1, "No": 0})

In [80]:
# 80/20 split stratified on the target, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Shapes of the four pieces, in the order train/test features then train/test labels.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((760, 18), (190, 18), (760,), (190,))

In [82]:
# Class proportions in each split, to check that the stratified split kept the label balance.
y_train.value_counts(normalize = True), y_test.value_counts(normalize = True)

(Loan_Approved
 0    0.686842
 1    0.313158
 Name: proportion, dtype: float64,
 Loan_Approved
 0    0.684211
 1    0.315789
 Name: proportion, dtype: float64)

In [96]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression

# Partition the feature names by dtype: object columns are treated as
# categorical, everything else as numeric.
cat_cols = X.select_dtypes(include="object").columns
num_cols = X.select_dtypes(exclude="object").columns

# Numeric branch: fill gaps with the column median, then standardise.
num_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical branch: fill gaps with the literal "Unknown", then one-hot encode;
# categories not seen during fit are ignored at transform time.
cat_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant", fill_value="Unknown")),
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
])

# Route each column group through its own branch.
preprocessor = ColumnTransformer(transformers=[
    ("cat", cat_pipeline, cat_cols),
    ("num", num_pipeline, num_cols),
])

# End-to-end estimator: preprocessing followed by logistic regression.
model = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("clf", LogisticRegression(max_iter=1000)),
])


In [97]:
# Fit the preprocessing steps and the classifier on the training split.
model.fit(X_train, y_train)

In [98]:
# Predict class labels for the held-out test split.
y_pred = model.predict(X_test)

In [99]:
# (accuracy, recall, precision) of the stored test predictions.
tuple(
    metric(y_test, y_pred)
    for metric in (accuracy_score, recall_score, precision_score)
)

(0.8526315789473684, 0.7166666666666667, 0.7962962962962963)

In [100]:
# (accuracy, recall, precision) on the training split. Inference is run once
# and the labels reused for all three metrics instead of calling
# model.predict(X_train) separately for each metric.
y_train_pred = model.predict(X_train)
accuracy_score(y_train, y_train_pred), recall_score(y_train, y_train_pred), precision_score(y_train, y_train_pred)

(0.8842105263157894, 0.7815126050420168, 0.8378378378378378)

In [103]:
# L1-penalised logistic regression whose regularisation strength is chosen by
# cross-validation, behind the same preprocessing step. cv=5 is spelled out to
# match the other LogisticRegressionCV pipelines in this notebook (it equals
# the library default in scikit-learn >= 0.22, so results are unchanged).
# NOTE: this rebinds `model`, replacing the plain LogisticRegression pipeline
# defined earlier.
model = Pipeline([
    ("preprocess", preprocessor),
    ("clf", LogisticRegressionCV(penalty='l1', solver='liblinear', cv=5)),
])

In [104]:
# Fit the cross-validated pipeline on the training split.
model.fit(X_train, y_train)

In [106]:
# (accuracy, recall, precision) on the test split. Inference is run once and
# the labels reused for all three metrics instead of calling
# model.predict(X_test) separately for each metric.
y_test_pred = model.predict(X_test)
accuracy_score(y_test, y_test_pred), recall_score(y_test, y_test_pred), precision_score(y_test, y_test_pred)

(0.8842105263157894, 0.7833333333333333, 0.8392857142857143)

In [107]:
# (accuracy, recall, precision) on the training split. Inference is run once
# and the labels reused for all three metrics instead of calling
# model.predict(X_train) separately for each metric.
y_train_pred = model.predict(X_train)
accuracy_score(y_train, y_train_pred), recall_score(y_train, y_train_pred), precision_score(y_train, y_train_pred)

(0.8842105263157894, 0.7899159663865546, 0.831858407079646)

In [71]:
# Column dtypes and non-null counts after Applicant_ID was dropped.
# NOTE(review): this cell was executed as In [71], before the Loan_Approved
# dropna cell (In [75]); its saved output therefore still lists 1000 rows.
# On a fresh top-to-bottom run it will report the filtered frame instead.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 19 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Applicant_Income    950 non-null    float64
 1   Coapplicant_Income  950 non-null    float64
 2   Employment_Status   950 non-null    object 
 3   Age                 950 non-null    float64
 4   Marital_Status      950 non-null    object 
 5   Dependents          950 non-null    float64
 6   Credit_Score        950 non-null    float64
 7   Existing_Loans      950 non-null    float64
 8   DTI_Ratio           950 non-null    float64
 9   Savings             950 non-null    float64
 10  Collateral_Value    950 non-null    float64
 11  Loan_Amount         950 non-null    float64
 12  Loan_Term           950 non-null    float64
 13  Loan_Purpose        950 non-null    object 
 14  Property_Area       950 non-null    object 
 15  Education_Level     950 non-null    object 
 16  Gender 