In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

# Hand-entered sample of 20 player records. Each row follows the column order
# given to `from_records` below: Name, JerseyNumber, Age, PreferredFoot, Wage, Value.
# The first ten rows are 'Right'-footed players, the last ten are 'Left'-footed.
# NOTE(review): units of Wage and Value are not stated anywhere in this notebook — confirm with the data source.
rows = [['M. Dupé', 1, 25, 'Right', 8.0, 900.0],
       ['N. Fernández', 26, 18, 'Right', 1.0, 450.0],
       ['P. Kalambayi', 30, 18, 'Right', 1.0, 130.0],
       ['P. McNair', 17, 23, 'Right', 22.0, 2300.0],
       ['G. Bojanich', 23, 33, 'Right', 6.0, 425.0],
       ['A. Kofler', 31, 31, 'Right', 3.0, 325.0],
       ['N. Lavanchy', 14, 24, 'Right', 3.0, 600.0],
       ['O. Al Khalaf', 8, 21, 'Right', 3.0, 240.0],
       ['J. Sills', 21, 31, 'Right', 7.0, 600.0],
       ['B. Fox', 12, 20, 'Right', 1.0, 230.0],
       ['S. Smith', 9, 20, 'Left', 4.0, 450.0],
       ['E. Ocansey', 28, 20, 'Left', 5.0, 1600.0],
       ['F. Kostić', 10, 25, 'Left', 16.0, 10500.0],
       ['M. Ullmann', 13, 22, 'Left', 3.0, 1000.0],
       ['R. Taylor', 9, 30, 'Left', 4.0, 625.0],
       ['N. Vikonis', 34, 34, 'Left', 7.0, 2700.0],
       ['J. Aguirre', 29, 21, 'Left', 1.0, 575.0],
       ['J. Konings', 25, 20, 'Left', 1.0, 500.0],
       ['J. Raitala', 22, 29, 'Left', 3.0, 700.0],
       ['A. Taylor', 3, 31, 'Left', 3.0, 425.0]]
# Build the working frame; the column list must stay in the same order as the fields in each row.
df = pd.DataFrame.from_records(rows, columns=["Name", "JerseyNumber", "Age", "PreferredFoot", "Wage", "Value"])
# Last expression of the cell: display the dtype pandas inferred for each column.
df.dtypes

Name              object
JerseyNumber       int64
Age                int64
PreferredFoot     object
Wage             float64
Value            float64
dtype: object

In [2]:
# Feature columns used to explain "Value"; later cells reuse this list.
columns = ["JerseyNumber", "Age"]

# Fit ordinary least squares on plain NumPy arrays. The target is passed as an
# (n, 1) column vector, so `coef_` comes back 2-D and `intercept_` 1-D.
r = LinearRegression().fit(df[columns].to_numpy(), df[["Value"]].to_numpy())

print("Coef:", r.coef_)
print("Intercept:", r.intercept_)

Coef: [[-27.27295206  33.78800374]]
Intercept: [923.53888226]


In [5]:
def formula(reg, columns):
    """Render a fitted linear model as a readable equation string.

    The result has the form ``"<c0>*<col0> + <c1>*<col1> + ... + <intercept>"``.

    Parameters
    ----------
    reg : object
        Fitted estimator exposing ``coef_`` and ``intercept_``. ``coef_`` may be
        1-D (model fitted with a 1-D target) or 2-D (fitted with a column-vector
        target); in the 2-D case only the first row is used. ``intercept_`` may
        be a scalar or an array, of which the first element is used.
    columns : sequence of str
        Feature names, in the same order as the coefficients.

    Returns
    -------
    str
        The equation text; just the intercept when ``columns`` is empty.

    Raises
    ------
    IndexError
        If ``columns`` has more entries than there are coefficients.
    """
    # Normalise both shapes so 1-D and 2-D fits are handled by the same code path.
    coefs = np.atleast_2d(reg.coef_)[0]
    intercept = np.atleast_1d(reg.intercept_)[0]

    # Index by position (rather than zip) so a too-long `columns` still raises.
    terms = ["{}*{}".format(coefs[i], name) for i, name in enumerate(columns)]
    terms.append(str(intercept))
    return " + ".join(terms)

# Show the fitted model as an approximate equation for Value.
print("Value ~= {}".format(formula(r, columns)))

Value ~= -27.27295205695321*JerseyNumber + 33.788003740974744*Age + 923.5388822632224


# Part 2

In [9]:
def euclidean_dist(df, col1, col2):
    """Return the Euclidean (L2) distance between two columns of ``df``.

    Computed as the square root of the summed squared element-wise
    differences ``df[col1] - df[col2]``.
    """
    residuals = df[col1] - df[col2]
    sum_of_squares = (residuals ** 2).sum()
    return sum_of_squares ** 0.5

def mean_squared_error(df, col1, col2):
    """Return the mean squared error between two columns of ``df``.

    The squared element-wise differences ``df[col1] - df[col2]`` are averaged
    directly. This avoids taking a square root and squaring it again, which
    introduces floating-point error (e.g. 1.0000000000000002 instead of 1.0).

    Rows where the difference is NaN are left out of the average (pandas'
    default ``skipna`` behaviour), so missing values are not counted as a
    zero error. An empty frame yields NaN.
    """
    squared_errors = (df[col1] - df[col2]) ** 2
    return squared_errors.mean()

In [10]:
# Predict from the same plain-array input that `r` was fitted on. Passing a
# DataFrame to a model fitted on a bare array makes scikit-learn (>= 1.0) emit a
# feature-name UserWarning.
# The model was fitted on an (n, 1) target, so predictions come back as (n, 1);
# flatten them to 1-D before storing them as a column.
df["p"] = r.predict(df[columns].values).ravel()

print("euclidean dist:", euclidean_dist(df, "Value", "p"))
print("MSE:", mean_squared_error(df, "Value", "p"))

df

euclidean dist: 9822.864215651065
MSE: 4824433.069955911


Unnamed: 0,Name,JerseyNumber,Age,PreferredFoot,Wage,Value,p
0,M. Dupé,1,25,Right,8.0,900.0,1740.966024
1,N. Fernández,26,18,Right,1.0,450.0,822.626196
2,P. Kalambayi,30,18,Right,1.0,130.0,713.534388
3,P. McNair,17,23,Right,22.0,2300.0,1237.022783
4,G. Bojanich,23,33,Right,6.0,425.0,1411.265108
5,A. Kofler,31,31,Right,3.0,325.0,1125.505484
6,N. Lavanchy,14,24,Right,3.0,600.0,1352.629643
7,O. Al Khalaf,8,21,Right,3.0,240.0,1414.903344
8,J. Sills,21,31,Right,7.0,600.0,1398.235005
9,B. Fox,12,20,Right,1.0,230.0,1272.023532
