In [51]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,confusion_matrix,precision_score,recall_score,classification_report

In [3]:
sns.get_dataset_names()

['anagrams',
 'anscombe',
 'attention',
 'brain_networks',
 'car_crashes',
 'diamonds',
 'dots',
 'dowjones',
 'exercise',
 'flights',
 'fmri',
 'geyser',
 'glue',
 'healthexp',
 'iris',
 'mpg',
 'penguins',
 'planets',
 'seaice',
 'taxis',
 'tips',
 'titanic']

In [4]:
MPG=sns.load_dataset("mpg")
MPG.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
0,18.0,8,307.0,130.0,3504,12.0,70,usa,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,usa,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,usa,plymouth satellite
3,16.0,8,304.0,150.0,3433,12.0,70,usa,amc rebel sst
4,17.0,8,302.0,140.0,3449,10.5,70,usa,ford torino


In [5]:
MPG["cylinders"].unique()

array([8, 4, 6, 3, 5])

In [6]:
MPG["origin"].unique()

array(['usa', 'japan', 'europe'], dtype=object)

In [7]:
MPG.drop('name',axis=1,inplace=True)

In [8]:
MPG.tail()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
393,27.0,4,140.0,86.0,2790,15.6,82,usa
394,44.0,4,97.0,52.0,2130,24.6,82,europe
395,32.0,4,135.0,84.0,2295,11.6,82,usa
396,28.0,4,120.0,79.0,2625,18.6,82,usa
397,31.0,4,119.0,82.0,2720,19.4,82,usa


In [9]:
from sklearn.preprocessing import LabelEncoder

le=LabelEncoder()
MPG['origin']=le.fit_transform(MPG['origin'])

In [10]:
MPG.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
0,18.0,8,307.0,130.0,3504,12.0,70,2
1,15.0,8,350.0,165.0,3693,11.5,70,2
2,18.0,8,318.0,150.0,3436,11.0,70,2
3,16.0,8,304.0,150.0,3433,12.0,70,2
4,17.0,8,302.0,140.0,3449,10.5,70,2


In [11]:
MPG.tail()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
393,27.0,4,140.0,86.0,2790,15.6,82,2
394,44.0,4,97.0,52.0,2130,24.6,82,0
395,32.0,4,135.0,84.0,2295,11.6,82,2
396,28.0,4,120.0,79.0,2625,18.6,82,2
397,31.0,4,119.0,82.0,2720,19.4,82,2


In [12]:
MPG.isna().sum()

mpg             0
cylinders       0
displacement    0
horsepower      6
weight          0
acceleration    0
model_year      0
origin          0
dtype: int64

In [13]:
MPG['horsepower'].unique()

array([130., 165., 150., 140., 198., 220., 215., 225., 190., 170., 160.,
        95.,  97.,  85.,  88.,  46.,  87.,  90., 113., 200., 210., 193.,
        nan, 100., 105., 175., 153., 180., 110.,  72.,  86.,  70.,  76.,
        65.,  69.,  60.,  80.,  54., 208., 155., 112.,  92., 145., 137.,
       158., 167.,  94., 107., 230.,  49.,  75.,  91., 122.,  67.,  83.,
        78.,  52.,  61.,  93., 148., 129.,  96.,  71.,  98., 115.,  53.,
        81.,  79., 120., 152., 102., 108.,  68.,  58., 149.,  89.,  63.,
        48.,  66., 139., 103., 125., 133., 138., 135., 142.,  77.,  62.,
       132.,  84.,  64.,  74., 116.,  82.])

In [14]:
from scipy.stats import shapiro

stat,p=shapiro(MPG['horsepower'].dropna())
print("p.value:",p)

p.value: 5.0220692907916354e-15


In [15]:
MPG['horsepower'].fillna(MPG['horsepower'].mean(),inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  MPG['horsepower'].fillna(MPG['horsepower'].mean(),inplace=True)


In [16]:
MPG.isna().sum()

mpg             0
cylinders       0
displacement    0
horsepower      0
weight          0
acceleration    0
model_year      0
origin          0
dtype: int64

In [18]:
MPG.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
mpg,398.0,23.514573,7.815984,9.0,17.5,23.0,29.0,46.6
cylinders,398.0,5.454774,1.701004,3.0,4.0,4.0,8.0,8.0
displacement,398.0,193.425879,104.269838,68.0,104.25,148.5,262.0,455.0
horsepower,398.0,104.469388,38.199187,46.0,76.0,95.0,125.0,230.0
weight,398.0,2970.424623,846.841774,1613.0,2223.75,2803.5,3608.0,5140.0
acceleration,398.0,15.56809,2.757689,8.0,13.825,15.5,17.175,24.8
model_year,398.0,76.01005,3.697627,70.0,73.0,76.0,79.0,82.0
origin,398.0,1.449749,0.775076,0.0,1.0,2.0,2.0,2.0


In [21]:
MPG.drop_duplicates()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
0,18.0,8,307.0,130.0,3504,12.0,70,2
1,15.0,8,350.0,165.0,3693,11.5,70,2
2,18.0,8,318.0,150.0,3436,11.0,70,2
3,16.0,8,304.0,150.0,3433,12.0,70,2
4,17.0,8,302.0,140.0,3449,10.5,70,2
...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86.0,2790,15.6,82,2
394,44.0,4,97.0,52.0,2130,24.6,82,0
395,32.0,4,135.0,84.0,2295,11.6,82,2
396,28.0,4,120.0,79.0,2625,18.6,82,2


In [22]:
MPG.shape

(398, 8)

In [23]:
MPG.dtypes

mpg             float64
cylinders         int64
displacement    float64
horsepower      float64
weight            int64
acceleration    float64
model_year        int64
origin            int64
dtype: object

In [24]:
x=MPG.drop(labels="cylinders",axis=1)
x

Unnamed: 0,mpg,displacement,horsepower,weight,acceleration,model_year,origin
0,18.0,307.0,130.0,3504,12.0,70,2
1,15.0,350.0,165.0,3693,11.5,70,2
2,18.0,318.0,150.0,3436,11.0,70,2
3,16.0,304.0,150.0,3433,12.0,70,2
4,17.0,302.0,140.0,3449,10.5,70,2
...,...,...,...,...,...,...,...
393,27.0,140.0,86.0,2790,15.6,82,2
394,44.0,97.0,52.0,2130,24.6,82,0
395,32.0,135.0,84.0,2295,11.6,82,2
396,28.0,120.0,79.0,2625,18.6,82,2


In [25]:
y=MPG["cylinders"]
y

0      8
1      8
2      8
3      8
4      8
      ..
393    4
394    4
395    4
396    4
397    4
Name: cylinders, Length: 398, dtype: int64

In [53]:
x_train,x_test,y_train,y_test=train_test_split(x,y,train_size=0.80,shuffle=True,random_state=10)
x_train

Unnamed: 0,mpg,displacement,horsepower,weight,acceleration,model_year,origin
303,31.8,85.0,65.0,2020,19.2,79,1
347,37.0,85.0,65.0,1975,19.4,81,1
149,24.0,120.0,97.0,2489,15.0,74,1
100,18.0,250.0,88.0,3021,16.5,73,2
175,29.0,90.0,70.0,1937,14.0,75,0
...,...,...,...,...,...,...,...
369,34.0,112.0,88.0,2395,18.0,82,2
320,37.0,119.0,92.0,2434,15.0,80,1
15,22.0,198.0,95.0,2833,15.5,70,2
125,20.0,198.0,95.0,3102,16.5,74,2


In [54]:
x_test

Unnamed: 0,mpg,displacement,horsepower,weight,acceleration,model_year,origin
331,33.8,97.0,67.0,2145,18.0,80,1
111,18.0,70.0,90.0,2124,13.5,73,1
350,34.7,105.0,63.0,2215,14.9,81,2
205,28.0,97.0,75.0,2155,16.4,76,1
56,26.0,91.0,70.0,1955,20.5,71,2
...,...,...,...,...,...,...,...
304,37.3,91.0,69.0,2130,14.7,79,0
187,17.5,305.0,140.0,4215,13.0,76,2
166,13.0,302.0,129.0,3169,12.0,75,2
64,15.0,318.0,150.0,4135,13.5,72,2


In [55]:
y_train

303    4
347    4
149    4
100    6
175    4
      ..
369    4
320    4
15     6
125    6
265    8
Name: cylinders, Length: 318, dtype: int64

In [56]:
y_test

331    4
111    3
350    4
205    4
56     4
      ..
304    4
187    8
166    8
64     8
27     8
Name: cylinders, Length: 80, dtype: int64

# model building

In [57]:
model=LogisticRegression()

In [58]:
model.fit(x_train,y_train)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [59]:
y_train_pred=model.predict(x_train)

In [60]:
y_test_pred=model.predict(x_test)

In [61]:
accuracy_score(y_train,y_train_pred)*100

96.54088050314465

In [62]:
accuracy_score(y_test,y_test_pred)*100

93.75

In [63]:
confusion_matrix(y_test,y_test_pred)

array([[ 0,  1,  0,  0,  0],
       [ 0, 37,  0,  0,  0],
       [ 0,  1,  0,  0,  0],
       [ 0,  2,  0, 17,  0],
       [ 0,  0,  0,  1, 21]])

In [67]:
print(classification_report(y_test,y_test_pred))

              precision    recall  f1-score   support

           3       0.00      0.00      0.00         1
           4       0.90      1.00      0.95        37
           5       0.00      0.00      0.00         1
           6       0.94      0.89      0.92        19
           8       1.00      0.95      0.98        22

    accuracy                           0.94        80
   macro avg       0.57      0.57      0.57        80
weighted avg       0.92      0.94      0.93        80



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
