## Packages

In [446]:
import pandas as pd
import numpy as np
from sklearn import tree
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn import metrics
from math import sqrt

---

## Load data

In [2]:
fish = pd.read_csv("fish_market.csv")

In [3]:
fish.head()

Unnamed: 0.1,Unnamed: 0,Species,Weight,Length1,Length2,Length3,Height,Width,Female
0,0,Bream,242.0,23.2,25.4,30.0,11.52,4.02,0
1,1,Bream,290.0,24.0,26.3,31.2,12.48,4.3056,1
2,2,Bream,340.0,23.9,26.5,31.1,12.3778,4.6961,0
3,3,Bream,363.0,26.3,29.0,33.5,12.73,4.4555,0
4,4,Bream,430.0,26.5,29.0,34.0,12.444,5.134,1


In [4]:
fish.columns

Index(['Unnamed: 0', 'Species', 'Weight', 'Length1', 'Length2', 'Length3',
       'Height', 'Width', 'Female'],
      dtype='object')

In [5]:
fish.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 176 entries, 0 to 175
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  176 non-null    int64  
 1   Species     176 non-null    object 
 2   Weight      176 non-null    float64
 3   Length1     176 non-null    float64
 4   Length2     176 non-null    float64
 5   Length3     176 non-null    float64
 6   Height      176 non-null    float64
 7   Width       176 non-null    float64
 8   Female      176 non-null    int64  
dtypes: float64(6), int64(2), object(1)
memory usage: 12.5+ KB


In [6]:
fish.drop("Unnamed: 0", axis=1, inplace=True)

In [7]:
fish.columns

Index(['Species', 'Weight', 'Length1', 'Length2', 'Length3', 'Height', 'Width',
       'Female'],
      dtype='object')

---

## Encoding

In [8]:
fish.Species.unique()

array(['Bream', 'Roach', 'Whitefish', 'Parkki', 'Perch', 'Pike', 'Smelt',
       ' Smelt'], dtype=object)

In [9]:
fish.Species = fish.Species.str.strip()

In [10]:
LabelEncoder_ = preprocessing.LabelEncoder()

In [11]:
fish.Species = LabelEncoder_.fit_transform(fish.Species)

In [12]:
fish.head()

Unnamed: 0,Species,Weight,Length1,Length2,Length3,Height,Width,Female
0,0,242.0,23.2,25.4,30.0,11.52,4.02,0
1,0,290.0,24.0,26.3,31.2,12.48,4.3056,1
2,0,340.0,23.9,26.5,31.1,12.3778,4.6961,0
3,0,363.0,26.3,29.0,33.5,12.73,4.4555,0
4,0,430.0,26.5,29.0,34.0,12.444,5.134,1


---

## Divide data into Train and Test Sets

In [13]:
features = fish[['Species', 'Weight', 'Length1', 'Length2', 'Length3', 'Height', 'Width']]
target = fish.Female

In [14]:
features.head()

Unnamed: 0,Species,Weight,Length1,Length2,Length3,Height,Width
0,0,242.0,23.2,25.4,30.0,11.52,4.02
1,0,290.0,24.0,26.3,31.2,12.48,4.3056
2,0,340.0,23.9,26.5,31.1,12.3778,4.6961
3,0,363.0,26.3,29.0,33.5,12.73,4.4555
4,0,430.0,26.5,29.0,34.0,12.444,5.134


In [15]:
x_train, x_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=17)

In [16]:
x_train.shape, y_train.shape

((140, 7), (140,))

In [17]:
x_test.shape, y_test.shape

((36, 7), (36,))

---

## Train model

In [601]:
model = tree.DecisionTreeClassifier(class_weight='balanced')

In [602]:
model.fit(x_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight='balanced', criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

---

## Test the model

In [603]:
y_predicted = model.predict(x_test)

In [604]:
print("Accuracy of the model:", metrics.accuracy_score(y_test, y_predicted))

Accuracy of the model: 0.8055555555555556


---

### Laboratory task - use DecisionTreeClassifier() model with different parameters. Try to understand how they influence the model accuracy.

1.1 Criterion: entropy --- nothing changed

1.2 Splitter: random --- changed the accuracy. Accuracy of our model decreased

1.3 Max depth: 
</br>if 1 accuracy = 0.55 and with increasing depth, the accuracy will also increase, but it only reaches the value 13 and the accuracy 0.83, and then begins to decrease

1.4 min_samples_split: with increasing min_samples_split accuracy decreases to 0.7222

1.5 min_samples_leaf: accuracy changes every time, even if we don't change the min_samples_leaf value

1.6 min_weight_fraction_leaf: with an increase in this value, the accuracy decreases to 0.5

1.7 max_features: with a change in max_features, the accuracy changes randomly

1.8 min_impurity_decrease: if value not equal to default value 0.0, accuracy equals to 0.5

1.9 min_impurity_split: with an increase in the value, the accuracy decreases, but after reaching a value of 0.5, it stops changing further and remains with an accuracy of 0.5