In [1]:
import numpy as np
import pandas as pd
from linear_regression import LinearRegression

In [2]:
np.random.seed(0)

dataset = pd.read_csv("FishMarket.csv")
dataset = dataset.sample(frac=1).reset_index(drop=True)
dataset

Unnamed: 0,Species,Weight,Length1,Length2,Length3,Height,Width
0,Bream,390.0,27.6,30.0,35.0,12.6700,4.6900
1,Roach,105.0,19.0,20.5,22.8,6.4752,3.3516
2,Perch,170.0,21.5,23.5,25.0,6.2750,3.7250
3,Roach,160.0,20.5,22.5,25.3,7.0334,3.8203
4,Perch,556.0,32.0,34.5,36.5,10.2565,6.3875
...,...,...,...,...,...,...,...
154,Bream,500.0,28.5,30.7,36.2,14.2266,4.9594
155,Perch,260.0,25.4,27.5,28.9,7.1672,4.3350
156,Parkki,170.0,19.0,20.7,23.2,9.3960,3.4104
157,Perch,650.0,36.5,39.0,41.4,11.1366,6.0030


In [3]:
dataset["Species"].value_counts()

Perch        56
Bream        35
Roach        20
Pike         17
Smelt        14
Parkki       11
Whitefish     6
Name: Species, dtype: int64

In [4]:
species = set(dataset["Species"])
fish_id = 1
new_dataset = dataset.copy()
for i in species:
    new_dataset.loc[new_dataset['Species'] == i, 'Species'] = fish_id * 100
    fish_id += 1

In [5]:
new_dataset = new_dataset.astype('float')
train_fish = new_dataset[:120]
test_fish = new_dataset[120:]

In [6]:
new_dataset

Unnamed: 0,Species,Weight,Length1,Length2,Length3,Height,Width
0,500.0,390.0,27.6,30.0,35.0,12.6700,4.6900
1,300.0,105.0,19.0,20.5,22.8,6.4752,3.3516
2,700.0,170.0,21.5,23.5,25.0,6.2750,3.7250
3,300.0,160.0,20.5,22.5,25.3,7.0334,3.8203
4,700.0,556.0,32.0,34.5,36.5,10.2565,6.3875
...,...,...,...,...,...,...,...
154,500.0,500.0,28.5,30.7,36.2,14.2266,4.9594
155,700.0,260.0,25.4,27.5,28.9,7.1672,4.3350
156,400.0,170.0,19.0,20.7,23.2,9.3960,3.4104
157,700.0,650.0,36.5,39.0,41.4,11.1366,6.0030


In [7]:
train_fish["Species"].value_counts()

700.0    42
500.0    25
300.0    16
100.0    14
200.0    10
400.0     9
600.0     4
Name: Species, dtype: int64

In [8]:
test_fish["Species"].value_counts()

700.0    14
500.0    10
200.0     4
300.0     4
100.0     3
400.0     2
600.0     2
Name: Species, dtype: int64

In [9]:
X_train_fish = np.array(train_fish.loc[:, train_fish.columns != 'Weight'])
y_train_fish_true = np.array(train_fish.loc[:, train_fish.columns == 'Weight'])

X_test_fish = np.array(test_fish.loc[:, test_fish.columns != 'Weight'])
y_test_fish_true = np.array(test_fish.loc[:, test_fish.columns == 'Weight'])

In [10]:
model = LinearRegression()
model.fit(X_train_fish, y_train_fish_true)

In [11]:
y_test_fish_pred = model.predict(X_test_fish)

In [12]:
r_score = model.r_score(y_test_fish_true, y_test_fish_pred)
print(r_score)

0.905758787281479


In [13]:
model.coefficients()

array([[-1.44563304e-01],
       [ 2.67285516e+01],
       [ 4.46982890e+01],
       [-4.78057929e+01],
       [ 3.00675732e+01],
       [ 4.01855246e+01],
       [-4.63890224e+02]])