In [1]:
%matplotlib inline

In [91]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.datasets import load_iris

from sklearn.preprocessing import PolynomialFeatures

from sklearn.linear_model import LinearRegression, RANSACRegressor, LogisticRegression
from sklearn.metrics import mean_squared_error

# Linear and Logistic Regression
## Live Demos

In [3]:
boston_data = pd.read_fwf(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data", header = None)

In [5]:
boston_data.shape

(506, 14)

In [7]:
boston_data.columns = ["crime_rate", "zoned_land", "industry", "bounds_river", "nox_conc", "rooms", "age", "distance",  "highways", "tax", "pt_ratio", "b_estimator", "pop_status", "price"]

In [8]:
boston_data.head()

Unnamed: 0,crime_rate,zoned_land,industry,bounds_river,nox_conc,rooms,age,distance,highways,tax,pt_ratio,b_estimator,pop_status,price
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36.2


In [10]:
boston_data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
crime_rate,506.0,1.71629,2.65351,0.00632,0.0819,0.250895,2.326718,9.96654
zoned_land,506.0,11.363636,23.322453,0.0,0.0,0.0,12.5,100.0
industry,506.0,11.136779,6.860353,0.46,5.19,9.69,18.1,27.74
bounds_river,506.0,0.06917,0.253994,0.0,0.0,0.0,0.0,1.0
nox_conc,506.0,0.554695,0.115878,0.385,0.449,0.538,0.624,0.871
rooms,506.0,6.284634,0.702617,3.561,5.8855,6.2085,6.6235,8.78
age,506.0,68.574901,28.148861,2.9,45.025,77.5,94.075,100.0
distance,506.0,3.696228,1.999689,0.5857,2.0737,3.1073,5.112625,9.2229
highways,506.0,4.332016,1.417166,1.0,4.0,4.0,5.0,8.0
tax,506.0,408.237154,168.537116,187.0,279.0,330.0,666.0,711.0


In [13]:
attributes = boston_data.drop("price", axis = 1)
target = boston_data.price

In [41]:
boston_data.corr()

Unnamed: 0,crime_rate,zoned_land,industry,bounds_river,nox_conc,rooms,age,distance,highways,tax,pt_ratio,b_estimator,pop_status,price
crime_rate,1.0,-0.300774,0.590822,0.013922,0.634679,-0.190197,0.482013,-0.495148,-0.088451,0.793392,0.362615,-0.377013,0.481907,-0.362077
zoned_land,-0.300774,1.0,-0.533828,-0.042697,-0.516604,0.311991,-0.569537,0.56666,-0.11929,-0.314563,-0.391679,0.17552,-0.412995,0.360445
industry,0.590822,-0.533828,1.0,0.062938,0.763651,-0.391676,0.644779,-0.678498,-0.087615,0.72076,0.383248,-0.356977,0.6038,-0.483725
bounds_river,0.013922,-0.042697,0.062938,1.0,0.091203,0.091251,0.086518,-0.09095,0.079105,-0.035587,-0.121515,0.048788,-0.053929,0.17526
nox_conc,0.634679,-0.516604,0.763651,0.091203,1.0,-0.302188,0.73147,-0.748872,0.009217,0.668023,0.188933,-0.380051,0.590879,-0.427321
rooms,-0.190197,0.311991,-0.391676,0.091251,-0.302188,1.0,-0.240265,0.225052,0.088753,-0.292048,-0.355501,0.128069,-0.613808,0.69536
age,0.482013,-0.569537,0.644779,0.086518,0.73147,-0.240265,1.0,-0.713313,0.019658,0.506456,0.261515,-0.273534,0.602339,-0.376955
distance,-0.495148,0.56666,-0.678498,-0.09095,-0.748872,0.225052,-0.713313,1.0,0.00303,-0.541369,-0.26914,0.293621,-0.479158,0.264325
highways,-0.088451,-0.11929,-0.087615,0.079105,0.009217,0.088753,0.019658,0.00303,1.0,-0.049221,-0.116969,0.040705,-0.069828,0.113519
tax,0.793392,-0.314563,0.72076,-0.035587,0.668023,-0.292048,0.506456,-0.541369,-0.049221,1.0,0.460853,-0.441808,0.543993,-0.468536


In [12]:
model = LinearRegression()

In [16]:
model.fit(attributes, target)

LinearRegression()

In [18]:
model.coef_

array([ 2.09281375e-01,  1.49403979e-02,  1.27164577e-02,  3.00565375e+00,
       -1.55234852e+01,  4.29955958e+00,  2.84848139e-03, -1.08366345e+00,
        1.93258621e-01, -2.42034372e-03, -9.65535221e-01,  9.43510233e-03,
       -5.25242783e-01])

In [19]:
model.intercept_

28.3051107500989

In [38]:
model.score(attributes, target)

0.7198065414937174

In [20]:
model_no_intercept = LinearRegression(fit_intercept = False)

In [21]:
model_no_intercept.fit(attributes, target)

LinearRegression(fit_intercept=False)

In [22]:
model_no_intercept.coef_

array([ 2.32266653e-02,  2.77104570e-02,  1.27058681e-02,  3.01506936e+00,
       -3.68298191e+00,  5.82571154e+00, -2.57508598e-03, -7.26850898e-01,
        3.10921428e-01, -3.14922442e-03, -5.07184355e-01,  1.39593084e-02,
       -4.43092704e-01])

In [37]:
model_no_intercept.intercept_

0.0

In [39]:
model_no_intercept.score(attributes, target)

0.7036175611789843

In [44]:
test_data = boston_data.sample(20)

In [45]:
test_attributes = test_data.drop("price", axis = 1)

In [46]:
model.predict(test_attributes)

array([17.24435394, 13.82098138, 21.8780933 , 14.34989721, 22.8631942 ,
        7.37787828, 18.64372574, 19.86454455, 27.2579925 , 15.30652453,
       23.68485379, 32.45104239, 26.21550713, 29.87454755, 15.50771871,
       12.23085729, 25.12810976, 33.72814848, 22.11565459, 26.76794828])

In [47]:
test_data["predicted_price"] = model.predict(test_attributes)

In [48]:
test_data

Unnamed: 0,crime_rate,zoned_land,industry,bounds_river,nox_conc,rooms,age,distance,highways,tax,pt_ratio,b_estimator,pop_status,price,predicted_price
401,4.2362,0.0,18.1,0,0.693,6.343,100.0,1.5741,4,666.0,20.2,396.9,20.32,7.2,17.244354
143,4.0974,0.0,19.58,0,0.871,5.468,100.0,1.4118,5,403.0,14.7,396.9,26.42,15.6,13.820981
71,0.15876,0.0,10.81,0,0.413,5.961,17.5,5.2873,4,305.0,19.2,376.94,9.88,21.7,21.878093
138,0.2498,0.0,21.89,0,0.624,5.857,98.2,1.6686,4,437.0,21.2,392.04,21.32,13.3,14.349897
44,0.12269,0.0,6.91,0,0.448,6.069,40.0,5.7209,3,233.0,17.9,389.39,9.55,21.2,22.863194
417,5.9406,0.0,18.1,0,0.679,5.304,89.1,1.6475,4,666.0,20.2,127.36,26.64,10.4,7.377878
451,5.44114,0.0,18.1,0,0.713,6.655,98.2,2.3552,4,666.0,20.2,355.29,17.73,15.2,18.643726
331,0.05023,35.0,6.06,0,0.4379,5.706,28.4,6.6407,1,304.0,16.9,394.02,12.43,17.1,19.864545
240,0.11329,30.0,4.93,0,0.428,6.897,54.3,6.3361,6,300.0,16.6,391.25,11.38,22.0,27.257992
467,4.42228,0.0,18.1,0,0.584,6.003,94.5,2.5403,4,666.0,20.2,331.29,21.32,19.1,15.306525


In [51]:
np.sqrt(mean_squared_error(test_data.price, test_data.predicted_price))

4.7340523069099225

In [53]:
ransac = RANSACRegressor(LinearRegression(), min_samples = 50, max_trials = 100, residual_threshold = 5.0)

In [54]:
ransac.fit(attributes, target)

RANSACRegressor(base_estimator=LinearRegression(), min_samples=50,
                residual_threshold=5.0)

In [56]:
ransac.estimator_.coef_

array([ 1.50407374e-01,  1.88087854e-03,  2.65900823e-02,  2.43587093e+00,
       -3.94919631e+00,  8.47888409e+00, -7.57725673e-02, -7.39598250e-01,
        4.07017460e-01, -7.92047201e-03, -7.51302037e-01,  1.64676357e-02,
       -6.72559341e-04])

In [55]:
ransac.score(attributes, target)

0.630944968424425

In [65]:
inlier_attributes = attributes[ransac.inlier_mask_]
inlier_target = target[ransac.inlier_mask_]

In [66]:
ransac.score(inlier_attributes, inlier_target)

0.9127380468655052

In [67]:
outlier_attributes = attributes[~ransac.inlier_mask_]
outlier_target = target[~ransac.inlier_mask_]

In [68]:
ransac.score(outlier_attributes, outlier_target)

0.25150195554093113

In [73]:
polynomial_transformer = PolynomialFeatures()

In [74]:
polynomial_transformer.fit(attributes)

PolynomialFeatures()

In [77]:
second_degree_attributes = polynomial_transformer.transform(attributes)

In [78]:
second_degree_attributes.shape

(506, 105)

In [82]:
second_degree_model = LinearRegression()

In [84]:
second_degree_model.fit(second_degree_attributes, target)

LinearRegression()

In [85]:
second_degree_model.score(second_degree_attributes, target)

0.8863751191596395

In [100]:
iris = load_iris()

In [101]:
iris_model = LogisticRegression(C = 1e9)

In [102]:
iris_model.fit(iris.data, iris.target)

LogisticRegression(C=1000000000.0)

In [103]:
iris_model.score(iris.data, iris.target)

0.9866666666666667

In [114]:
iris_poly = PolynomialFeatures(degree = 4).fit_transform(iris.data)

In [118]:
iris_poly.shape

(150, 70)

In [128]:
iris_model_poly = LogisticRegression()

In [129]:
iris_model_poly.fit(iris_poly.data, iris.target)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

In [130]:
iris_model_poly.score(iris_poly, iris.target)

0.9866666666666667

In [131]:
iris_testing_data = iris.data[:10]

In [132]:
iris_model.predict(iris_testing_data)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [133]:
iris.target[:10]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [134]:
iris_model.predict_proba(iris_testing_data)

array([[1.00000000e+00, 2.09849036e-31, 3.24053949e-58],
       [1.00000000e+00, 1.23433275e-24, 8.81061025e-50],
       [1.00000000e+00, 6.42931984e-28, 7.69223274e-54],
       [1.00000000e+00, 8.92914057e-23, 1.75765541e-47],
       [1.00000000e+00, 3.66133871e-32, 3.70905975e-59],
       [1.00000000e+00, 4.56465397e-31, 1.52474271e-56],
       [1.00000000e+00, 1.43611160e-27, 9.23659792e-53],
       [1.00000000e+00, 4.98676074e-28, 4.93460098e-54],
       [1.00000000e+00, 2.52600110e-21, 1.20632239e-45],
       [1.00000000e+00, 1.41306659e-24, 2.13277214e-50]])

In [135]:
iris_model.coef_

array([[  7.35271466,  20.39778454, -30.26348739, -14.14337754],
       [ -2.44376492,  -6.85843959,  10.41704506,  -2.07138612],
       [ -4.90894974, -13.53934495,  19.84644233,  16.21476366]])

In [136]:
iris.target

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [None]:
"Iris-setosa" -> LabelEncoder