# Linear Regression (housing.csv)

In [26]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import preprocessing, linear_model
from sklearn.metrics import mean_squared_error, r2_score

In [34]:
# Load the housing table. The file has no header row (columns get integer
# labels) and its fields are separated by runs of whitespace.
# `sep=r'\s+'` is the documented equivalent of `delim_whitespace=True`, which
# pandas deprecated in 2.2 and which now emits a FutureWarning.
f = pd.read_csv('housing.csv', header=None, sep=r'\s+')
print(f.head())

        0     1     2   3      4      5     6       7   8      9     10  \
0  0.00632  18.0  2.31   0  0.538  6.575  65.2  4.0900   1  296.0  15.3   
1  0.02731   0.0  7.07   0  0.469  6.421  78.9  4.9671   2  242.0  17.8   
2  0.02729   0.0  7.07   0  0.469  7.185  61.1  4.9671   2  242.0  17.8   
3  0.03237   0.0  2.18   0  0.458  6.998  45.8  6.0622   3  222.0  18.7   
4  0.06905   0.0  2.18   0  0.458  7.147  54.2  6.0622   3  222.0  18.7   

       11    12    13  
0  396.90  4.98  24.0  
1  396.90  9.14  21.6  
2  392.83  4.03  34.7  
3  394.63  2.94  33.4  
4  396.90  5.33  36.2  


In [36]:
# Convert the frame to a plain array and cut it at column 13: the first 13
# columns are the model inputs, the remaining column is the value to predict.
f_numpy = f.to_numpy(copy=True)
housing_x, housing_y = np.hsplit(f_numpy, [13])

# Hold out 10% of the rows for evaluation; the fixed seed keeps the split
# identical from run to run.
housing_x_train, housing_x_test, housing_y_train, housing_y_test = train_test_split(
    housing_x,
    housing_y,
    test_size=0.1,
    random_state=1,
)

# z-score normalization: the mean/std are estimated on the training rows only
# and the same fitted scaler is then applied to the held-out rows.
scaler = preprocessing.StandardScaler()
housing_x_train = scaler.fit_transform(housing_x_train)
housing_x_test = scaler.transform(housing_x_test)

# Build the linear regression model and train it on the standardized data.
model = linear_model.LinearRegression().fit(housing_x_train, housing_y_train)

# Predict the held-out targets.
housing_y_pred = model.predict(housing_x_test)

# Evaluate on the test split: mean squared error, then the r2_score value
# (printed under the label "Variance score").
print(f'Mean squared error: {mean_squared_error(housing_y_test, housing_y_pred)}')
print(f'Variance score: {r2_score(housing_y_test, housing_y_pred)}')

Mean squared error: 20.544274659325744
Variance score: 0.778638658029892


# Polynomial Regression (wine-quality)

In [49]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn import preprocessing

In [43]:
# Read the red-wine table with pandas' default CSV settings and preview the
# first five rows.
f = pd.read_csv('winequality-red.csv')
print(f.head(n=5))

   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0            7.4              0.70         0.00             1.9      0.076   
1            7.8              0.88         0.00             2.6      0.098   
2            7.8              0.76         0.04             2.3      0.092   
3           11.2              0.28         0.56             1.9      0.075   
4            7.4              0.70         0.00             1.9      0.076   

   free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  \
0                 11.0                  34.0   0.9978  3.51       0.56   
1                 25.0                  67.0   0.9968  3.20       0.68   
2                 15.0                  54.0   0.9970  3.26       0.65   
3                 17.0                  60.0   0.9980  3.16       0.58   
4                 11.0                  34.0   0.9978  3.51       0.56   

   alcohol  quality  
0      9.4        5  
1      9.8        5  
2      9.8        5 

In [53]:
# NOTE: train_test_split, linear_model, mean_squared_error and r2_score are not
# imported in this section's import cell; they come from the import cell at the
# top of the notebook, so that cell must have been run first.

# Inputs are every column except 'quality'; 'quality' is the target.
x = f.drop(['quality'], axis=1)
y = f['quality']

# Expand the inputs into degree-2 polynomial features.
poly = PolynomialFeatures(degree=2).fit(x)
x_poly = poly.transform(x)

# train-test split (20% held out). The seed is pinned, as in the housing
# example, so that Restart & Run All reproduces the same split and the same
# metrics; without it every run prints different numbers.
x_train, x_test, y_train, y_test = train_test_split(
    x_poly, y, test_size=0.2, random_state=1
)

# training data normalization: the scaler is fitted on the training rows only
scaler = preprocessing.StandardScaler().fit(x_train)
x_train = scaler.transform(x_train)

# build model: linear regression on the expanded, standardized features
model = linear_model.LinearRegression()
model.fit(x_train, y_train)

# testing data normalization: reuse the scaler fitted on the training rows
x_test = scaler.transform(x_test)

# predict y
y_pred = model.predict(x_test)

# evaluation: mean squared error and r2_score (printed as "Variance score")
print(f'Mean squared error: {mean_squared_error(y_test, y_pred)}')
print(f'Variance score: {r2_score(y_test, y_pred)}')

Mean squared error: 0.45767840419918243
Variance score: 0.3161702985336503


# Logistic Regression (diabetes)

In [3]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn import datasets, preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, confusion_matrix

In [4]:
# Read the Pima Indians diabetes table with pandas' default CSV settings and
# preview the first five rows.
f = pd.read_csv('pima-indians-diabetes.csv')
print(f.head(n=5))

   pregnant  glucose  bp  skin  insulin   bmi  pedigree  age  label
0         6      148  72    35        0  33.6     0.627   50      1
1         1       85  66    29        0  26.6     0.351   31      0
2         8      183  64     0        0  23.3     0.672   32      1
3         1       89  66    23       94  28.1     0.167   21      0
4         0      137  40    35      168  43.1     2.288   33      1


In [10]:
# Use four of the columns as inputs; 'label' is the class to predict.
x = f[['pregnant','insulin','bmi','age']]
y = f['label']

# train-test-split (30% held out). The seed is pinned, as in the housing
# example, so that Restart & Run All reproduces the same split and the same
# accuracy; without it every run prints a different number.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=1
)

# training data normalization: the scaler is fitted on the training rows only
scaler = preprocessing.StandardScaler().fit(x_train)
x_train = scaler.transform(x_train)

# build model
model = LogisticRegression()
model.fit(x_train, y_train)

# testing data normalization: reuse the scaler fitted on the training rows
x_test = scaler.transform(x_test)

# predict y
y_pred = model.predict(x_test)

# fraction of held-out rows whose predicted label matches the true label
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

Accuracy: 0.7142857142857143
