In [1]:
from pathlib import Path

import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

## 1

In [5]:
# Locate the CSV relative to the current user's home directory instead of
# hardcoding one account's absolute path, so the notebook also runs for anyone
# who keeps the file in their own Downloads folder.
# NOTE(review): assumes the file is still named data.csv and lives in
# ~/Downloads, as in the original path - adjust DATA_PATH if it has moved.
DATA_PATH = Path.home() / "Downloads" / "data.csv"
df = pd.read_csv(DATA_PATH)

## 2

In [8]:
# Echo the loaded frame under a heading; for a frame this large pandas prints
# only the leading and trailing rows.
heading = "Dataset entries:\n"
print(heading, df)

Dataset entries:
        age          job  marital            education  default housing loan  \
0       44  blue-collar  married             basic.4y  unknown     yes   no   
1       53   technician  married              unknown       no      no   no   
2       28   management   single    university.degree       no     yes   no   
3       39     services  married          high.school       no      no   no   
4       55      retired  married             basic.4y       no     yes   no   
...    ...          ...      ...                  ...      ...     ...  ...   
41183   59      retired  married          high.school  unknown      no  yes   
41184   31    housemaid  married             basic.4y  unknown      no   no   
41185   42       admin.   single    university.degree  unknown     yes  yes   
41186   48   technician  married  professional.course       no      no  yes   
41187   25      student   single          high.school       no      no   no   

         contact month day_of_wee

## 3

In [11]:
# Report the frame's (rows, columns) dimensions.
dataset_shape = df.shape
print("Shape of the dataset:", dataset_shape)

Shape of the dataset: (41188, 21)


## 4

In [14]:
# List the column labels of the loaded frame.
column_labels = df.columns
print("Column names:", column_labels)

Column names: Index(['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
       'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'emp_var_rate', 'cons_price_idx',
       'cons_conf_idx', 'euribor3m', 'nr_employed', 'y'],
      dtype='object')


## 5

In [17]:
# Mean-impute numeric columns, but only when something is actually missing.
# The frame still contains text columns at this point, so the means must be
# restricted to numeric columns: on pandas >= 2.0 a plain df.mean() raises
# TypeError when object columns are present.
if df.isnull().sum().sum() > 0:
    numeric_means = df.mean(numeric_only=True)
    # fillna only touches the columns present in numeric_means; gaps in text
    # columns are left as they are and will show up in the report below.
    df = df.fillna(numeric_means)
print("Missing values in each column:\n", df.isnull().sum())

Missing values in each column:
 age               0
job               0
marital           0
education         0
default           0
housing           0
loan              0
contact           0
month             0
day_of_week       0
duration          0
campaign          0
pdays             0
previous          0
poutcome          0
emp_var_rate      0
cons_price_idx    0
cons_conf_idx     0
euribor3m         0
nr_employed       0
y                 0
dtype: int64


## 6

In [37]:
# One-hot encode every object-dtype column; columns not listed in
# `categorical_columns` are carried over as they are.
object_dtype_view = df.select_dtypes(include=["object"])
categorical_columns = object_dtype_view.columns
df = pd.get_dummies(df, columns=categorical_columns)

In [39]:
# Separate the target column "y" from the remaining (feature) columns.
y = df["y"]
X = df.drop(columns="y")

In [41]:
# 70:30 hold-out split; the fixed seed makes the partition repeatable.
split_70_30 = train_test_split(X, y, test_size=0.3, random_state=42)
X_train_70, X_test_30, y_train_70, y_test_30 = split_70_30

In [43]:
# 80:20 hold-out split; the fixed seed makes the partition repeatable.
split_80_20 = train_test_split(X, y, test_size=0.2, random_state=42)
X_train_80, X_test_20, y_train_80, y_test_20 = split_80_20

## 7

In [45]:
# One logistic-regression classifier per split, each with a raised iteration
# cap (max_iter=10000).
model_70_30 = LogisticRegression(max_iter=10000)
model_80_20 = LogisticRegression(max_iter=10000)

# Fit each model on the training portion of its own split.
model_70_30.fit(X_train_70, y_train_70)
model_80_20.fit(X_train_80, y_train_80)

## 8

In [50]:
# Show the fitted weight arrays of both models (one weight per feature column).
coef_70_30 = model_70_30.coef_
coef_80_20 = model_80_20.coef_
print("Regression coefficients for 70:30 split:", coef_70_30)
print("Regression coefficients for 80:20 split:", coef_80_20)

Regression coefficients for 70:30 split: [[-5.82522343e-04  4.92701899e-03 -5.09984703e-02 -9.71793573e-04
  -4.62899168e-02 -1.16031652e+00  6.85373307e-01 -1.11445530e-02
   8.63462015e-01 -1.37242868e-02  4.27968414e-02 -1.79894787e-01
  -1.50773167e-02  3.63789039e-02  7.02211572e-02  1.53162012e-01
  -2.01473841e-01 -7.24811741e-02  1.67912672e-01  1.53137184e-02
  -1.19991881e-01  9.85788068e-02 -2.37092014e-02  5.24384110e-03
   6.17649750e-02 -4.78545026e-02 -4.37313731e-02  4.73736808e-02
  -1.30844371e-01 -4.68511269e-02  1.15031723e-01 -1.33549017e-02
   5.39380226e-02  1.38834578e-02  1.47205969e-01 -1.49755601e-01
  -2.00525569e-03  9.40881353e-04 -1.01110169e-02  4.61524774e-03
   6.58521428e-02 -1.01110169e-02 -6.02960137e-02  2.64591651e-01
  -2.69146539e-01 -4.27707058e-03  4.52889009e-01 -2.62932041e-02
   2.17676623e-01  5.76644045e-03  1.45146245e+00 -6.17612592e-01
  -6.97748523e-01 -4.38042564e-01 -3.48375457e-01 -1.89143564e-02
  -1.40435403e-01  9.51740037e-03  

## 9

In [55]:
# Start each equation string with its label and the model's intercept,
# rounded to two decimals.
# NOTE(review): for a logistic-regression model the intercept and weights
# describe the linear decision function (log-odds), not y itself, and the
# model uses many features, so the "SLR" label may be a misnomer - confirm.
intercept_70 = model_70_30.intercept_[0]
intercept_80 = model_80_20.intercept_[0]
equation_70 = f"SLR Equation for 70:30 split: y = {intercept_70:.2f}"
equation_80 = f"SLR Equation for 80:20 split: y = {intercept_80:.2f}"

In [57]:
# Build both equation strings from scratch in this cell. Appending onto
# strings created in another cell meant that re-running this cell added every
# " + coef*feature" term a second time; rebuilding makes the cell idempotent
# while producing the same text on a fresh run.
feature_names = X.columns

terms_70 = "".join(
    f" + {weight:.2f}*{name}"
    for weight, name in zip(model_70_30.coef_[0], feature_names)
)
terms_80 = "".join(
    f" + {weight:.2f}*{name}"
    for weight, name in zip(model_80_20.coef_[0], feature_names)
)

equation_70 = f"SLR Equation for 70:30 split: y = {model_70_30.intercept_[0]:.2f}" + terms_70
equation_80 = f"SLR Equation for 80:20 split: y = {model_80_20.intercept_[0]:.2f}" + terms_80

In [59]:
# Emit the two assembled equations, one per line.
print(equation_70, equation_80, sep="\n")

SLR Equation for 70:30 split: y = -0.00 + -0.00*age + 0.00*duration + -0.05*campaign + -0.00*pdays + -0.05*previous + -1.16*emp_var_rate + 0.69*cons_price_idx + -0.01*cons_conf_idx + 0.86*euribor3m + -0.01*nr_employed + 0.04*job_admin. + -0.18*job_blue-collar + -0.02*job_entrepreneur + 0.04*job_housemaid + 0.07*job_management + 0.15*job_retired + -0.20*job_self-employed + -0.07*job_services + 0.17*job_student + 0.02*job_technician + -0.12*job_unemployed + 0.10*job_unknown + -0.02*marital_divorced + 0.01*marital_married + 0.06*marital_single + -0.05*marital_unknown + -0.04*education_basic.4y + 0.05*education_basic.6y + -0.13*education_basic.9y + -0.05*education_high.school + 0.12*education_illiterate + -0.01*education_professional.course + 0.05*education_university.degree + 0.01*education_unknown + 0.15*default_no + -0.15*default_unknown + -0.00*default_yes + 0.00*housing_no + -0.01*housing_unknown + 0.00*housing_yes + 0.07*loan_no + -0.01*loan_unknown + -0.06*loan_yes + 0.26*contact_ce

## 10

In [62]:
# Predict labels for the held-out rows of each split with the matching model.
y_pred_30, y_pred_20 = (
    model_70_30.predict(X_test_30),
    model_80_20.predict(X_test_20),
)

## 11

In [65]:
# Score each model on its own held-out rows and report the result.
accuracy_70 = accuracy_score(y_true=y_test_30, y_pred=y_pred_30)
print("Accuracy for 70:30 split:", accuracy_70)

accuracy_80 = accuracy_score(y_true=y_test_20, y_pred=y_pred_20)
print("Accuracy for 80:20 split:", accuracy_80)

Accuracy for 70:30 split: 0.9065307113377034
Accuracy for 80:20 split: 0.9079873755765963


In [67]:
# Fit one single-feature logistic regression per column and report its
# hold-out accuracy.
#
# The 70:30 partition is computed once, outside the loop: with a fixed seed it
# depends only on the number of rows, so splitting again for every feature
# repeated the same work and produced the same rows each time.
X_train_all, X_test_all, y_train_feature, y_test_feature = train_test_split(
    X, y, test_size=0.3, random_state=42
)

for feature in X.columns:
    # Double brackets keep a one-column DataFrame (2-D), which is the input
    # shape the estimator expects.
    X_train_feature = X_train_all[[feature]]
    X_test_feature = X_test_all[[feature]]

    # Same iteration cap as the full models fitted earlier in the notebook.
    # NOTE(review): the default cap may stop the solver early on unscaled
    # columns, so individual accuracies can shift slightly - confirm.
    model = LogisticRegression(max_iter=10000)
    model.fit(X_train_feature, y_train_feature)

    y_pred_feature = model.predict(X_test_feature)
    accuracy_feature = accuracy_score(y_test_feature, y_pred_feature)

    print(f"Accuracy with {feature} as the independent variable: {accuracy_feature}")

Accuracy with age as the independent variable: 0.8845998219632597
Accuracy with duration as the independent variable: 0.8874322246499959
Accuracy with campaign as the independent variable: 0.8845998219632597
Accuracy with pdays as the independent variable: 0.8948773974265598
Accuracy with previous as the independent variable: 0.8865420409484502
Accuracy with emp_var_rate as the independent variable: 0.8845998219632597
Accuracy with cons_price_idx as the independent variable: 0.8845998219632597
Accuracy with cons_conf_idx as the independent variable: 0.8845998219632597
Accuracy with euribor3m as the independent variable: 0.8845998219632597
Accuracy with nr_employed as the independent variable: 0.8843570445901109
Accuracy with job_admin. as the independent variable: 0.8845998219632597
Accuracy with job_blue-collar as the independent variable: 0.8845998219632597
Accuracy with job_entrepreneur as the independent variable: 0.8845998219632597
Accuracy with job_housemaid as the independent va