# First model
This notebook follows the five steps from Ch. 5 of the Data Science Handbook:
1. Load and arrange data into feature matrix and target vector
2. Choose model class
3. Instantiate model
4. Fit model to data
5. Predict values for new data and evaluate results

## 1. Load and arrange data 

In [1]:
# Record which Python interpreter produced the outputs in this notebook.
import sys
print(sys.version)

3.8.18 (default, Sep 11 2023, 08:17:16) 
[Clang 14.0.6 ]


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


In [3]:
import seaborn as sns

In [34]:
from sklearn.datasets import load_iris

# Load the iris dataset and print the names of its feature columns and of
# its target classes.
data = load_iris()
print(data['feature_names'])
print(data['target_names'])

['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
['setosa' 'versicolor' 'virginica']


In [5]:
# Feature matrix from the array-based load; show its container type and
# its (rows, columns) shape, one per line.
X = data['data']
print(type(X), X.shape, sep="\n")

<class 'numpy.ndarray'>
(150, 4)


In [6]:
# Reload the dataset with as_frame=True so the features and target come back
# as pandas objects. This rebinds `data`, replacing the array-based load.
data = load_iris(as_frame=True)
data.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])

In [7]:
# Feature matrix as a DataFrame (rebinds X from the earlier ndarray version).
X = data['data']
print(X.shape)
# Last expression of the cell, so the first five rows are rendered as a table.
X.head()

(150, 4)


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [24]:
# Target vector: one integer class label per row of X.
y = data['target']
print(y.shape)
# Display the Series itself; pandas abbreviates long output, so there is no
# need to hard-code the number of rows.
y

(150,)


0      0
1      0
2      0
3      0
4      0
      ..
145    2
146    2
147    2
148    2
149    2
Name: target, Length: 150, dtype: int64

### Set some data aside

In [26]:
from sklearn.model_selection import train_test_split

# Hold out part of the data for evaluation. No test_size is given, so
# scikit-learn's default split applies (112 training rows out of 150 here).
# The fixed random_state makes the same rows land in each partition on
# every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=498,
)
print(y_train)


89     1
102    2
26     0
83     1
41     0
      ..
27     0
147    2
69     1
20     0
105    2
Name: target, Length: 112, dtype: int64


In [20]:
# Training features; the row index keeps the original (shuffled) row labels.
print(X_train)

     sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
89                 5.5               2.5                4.0               1.3
102                7.1               3.0                5.9               2.1
26                 5.0               3.4                1.6               0.4
83                 6.0               2.7                5.1               1.6
41                 4.5               2.3                1.3               0.3
..                 ...               ...                ...               ...
27                 5.2               3.5                1.5               0.2
147                6.5               3.0                5.2               2.0
69                 5.6               2.5                3.9               1.1
20                 5.4               3.4                1.7               0.2
105                7.6               3.0                6.6               2.1

[112 rows x 4 columns]


In [18]:
# Held-out features that the model will not see during fitting.
print(X_test)

     sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
141                6.9               3.1                5.1               2.3
149                5.9               3.0                5.1               1.8
134                6.1               2.6                5.6               1.4
62                 6.0               2.2                4.0               1.0
1                  4.9               3.0                1.4               0.2
80                 5.5               2.4                3.8               1.1
0                  5.1               3.5                1.4               0.2
63                 6.1               2.9                4.7               1.4
140                6.7               3.1                5.6               2.4
109                7.2               3.6                6.1               2.5
71                 6.1               2.8                4.0               1.3
9                  4.9               3.1                1.5     

In [19]:
# True class labels for the held-out rows, indexed like X_test.
print(y_test)

141    2
149    2
134    2
62     1
1      0
80     1
0      0
63     1
140    2
109    2
71     1
9      0
115    2
101    2
46     0
148    2
17     0
108    2
87     1
32     0
145    2
23     0
92     1
93     1
52     1
2      0
128    2
99     1
7      0
22     0
123    2
146    2
3      0
122    2
10     0
30     0
70     1
103    2
Name: target, dtype: int64


## 2. Choose a model class

In [10]:
from sklearn.linear_model import LogisticRegression

## 3. Instantiate model

In [27]:
# max_iter is raised well above scikit-learn's default of 100.
# NOTE(review): presumably to avoid a ConvergenceWarning on the unscaled
# features -- confirm, and consider naming the value as a constant.
model = LogisticRegression(max_iter=1500)

## 4. Fit model to data

In [28]:
# Fit on the training partition only; the test rows stay unseen until evaluation.
model.fit(X_train, y_train)

## 5. Predict values for new data and evaluate

In [29]:
# Predicted class label for each held-out row, in the same order as X_test.
y_pred = model.predict(X_test)

In [13]:
# NOTE(review): this repeats the earlier print(X_test) cell in the
# "Set some data aside" section; presumably kept to read the inputs next to
# the predictions -- consider dropping one of the two.
print(X_test)

     sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
141                6.9               3.1                5.1               2.3
149                5.9               3.0                5.1               1.8
134                6.1               2.6                5.6               1.4
62                 6.0               2.2                4.0               1.0
1                  4.9               3.0                1.4               0.2
80                 5.5               2.4                3.8               1.1
0                  5.1               3.5                1.4               0.2
63                 6.1               2.9                4.7               1.4
140                6.7               3.1                5.6               2.4
109                7.2               3.6                6.1               2.5
71                 6.1               2.8                4.0               1.3
9                  4.9               3.1                1.5     

In [30]:
# Predicted labels as an array; compare position by position with y_test.
print(y_pred)

[2 2 2 1 0 1 0 1 2 2 1 0 2 2 0 2 0 2 1 0 2 0 1 1 1 0 2 1 0 0 2 2 0 2 0 0 2
 2]


In [31]:
from sklearn.metrics import accuracy_score
# Fraction of held-out rows whose predicted label equals the true label.
accuracy_score(y_test, y_pred)

0.9736842105263158

In [32]:
# For a classifier, score() predicts on X_test and reports mean accuracy, so
# it gives the same number as accuracy_score(y_test, y_pred).
model.score(X_test, y_test)

0.9736842105263158