In [1]:
import numpy as np
import numpy.random as random
import scipy as sp
from pandas import Series,DataFrame
import pandas as pd

import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
%matplotlib inline

import sklearn

%precision 3

'%.3f'

In [42]:
import requests,zipfile
import io

url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data'
res = requests.get(url).content

auto = pd.read_csv(io.StringIO(res.decode('utf-8')),header=None)

auto.columns = ['symboling','normalized-losses','make','fuel-type','aspiration','num-of-doors','body-style','drive-wheels','engine-location','wheel-base','length','width','height','curb-weight','engine-type','num-of-cylinders','engine-size','fule-system','bore','stroke','compression-ratio','horsepower','peak-rpm','city-mpg','highway-mpg','price']

In [43]:
print('自動車データの形式：{}'.format(auto.shape))

自動車データの形式：(205, 26)


In [44]:
auto.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fule-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


In [45]:
auto = auto[['price','horsepower','width','height']]

In [46]:
auto.isin(['?']).sum()

price         4
horsepower    2
width         0
height        0
dtype: int64

In [47]:
auto = auto.replace('?',np.nan).dropna()
print('自動車データの形式：{}'.format(auto.shape))

自動車データの形式：(199, 4)


In [48]:
print('データ型の確認 \n{}\n'.format(auto.dtypes))

データ型の確認 
price          object
horsepower     object
width         float64
height        float64
dtype: object



In [49]:
auto = auto.assign(price=pd.to_numeric(auto.price))
auto = auto.assign(horsepower=pd.to_numeric(auto.horsepower))
print('データ型の確認 \n{}\n'.format(auto.dtypes))

データ型の確認 
price           int64
horsepower      int64
width         float64
height        float64
dtype: object



In [50]:
auto.corr()

Unnamed: 0,price,horsepower,width,height
price,1.0,0.810533,0.753871,0.13499
horsepower,0.810533,1.0,0.615315,-0.087407
width,0.753871,0.615315,1.0,0.309223
height,0.13499,-0.087407,0.309223,1.0


In [12]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

X = auto.drop('price',axis=1)
y = auto['price']

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.5,random_state=0)

model = LinearRegression()
model.fit(X_train,y_train)

print('決定係数(train):{:.3f}'.format(model.score(X_train,y_train)))
print('決定係数(test):{:.3f}'.format(model.score(X_test,y_test)))

print('\n回帰係数\n{}'.format(pd.Series(model.coef_,index=X.columns)))
print('切片:{:.3f}'.format(model.intercept_))

決定係数(train):0.733
決定係数(test):0.737

回帰係数
horsepower      81.651078
width         1829.174506
height         229.510077
dtype: float64
切片:-128409.046


In [16]:
auto = auto[['price','length','engine-size']]

In [17]:
auto.isin(['?']).sum()

price          4
length         0
engine-size    0
dtype: int64

In [18]:
auto = auto.replace('?',np.nan).dropna()
print('自動車データの形式：{}'.format(auto.shape))

自動車データの形式：(201, 3)


In [19]:
auto.corr()

Unnamed: 0,length,engine-size
length,1.0,0.685025
engine-size,0.685025,1.0


In [20]:
print('データ型の確認 \n{}\n'.format(auto.dtypes))

データ型の確認 
price           object
length         float64
engine-size      int64
dtype: object



In [21]:
auto = auto.assign(price=pd.to_numeric(auto.price))

In [22]:
auto.corr()

Unnamed: 0,price,length,engine-size
price,1.0,0.690628,0.872335
length,0.690628,1.0,0.685025
engine-size,0.872335,0.685025,1.0


In [23]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

X = auto.drop('price',axis=1)
y = auto['price']

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.5,random_state=0)

model = LinearRegression()
model.fit(X_train,y_train)

print('決定係数(train):{:.3f}'.format(model.score(X_train,y_train)))
print('決定係数(test):{:.3f}'.format(model.score(X_test,y_test)))

print('\n回帰係数\n{}'.format(pd.Series(model.coef_,index=X.columns)))
print('切片:{:.3f}'.format(model.intercept_))

決定係数(train):0.771
決定係数(test):0.763

回帰係数
length         179.544547
engine-size    120.273905
dtype: float64
切片:-33590.411


In [24]:
url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'
res = requests.get(url).content

adult = pd.read_csv(io.StringIO(res.decode('utf-8')),header=None)

adult.columns = ['age','workclass','fnlwgt','education','education-num','marital-status','occupation','relationship','race','sex','capital-gain','capital-loss','hours-per-week','native-country','flg-50K']

print('データの形式：{}'.format(adult.shape))
print('欠損の数：{}'.format(adult.isnull().sum().sum()))

adult.head()

データの形式：(32561, 15)
欠損の数：0


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,flg-50K
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [25]:
adult.groupby('flg-50K').size()

flg-50K
 <=50K    24720
 >50K      7841
dtype: int64

In [27]:
adult['fin_flg'] = adult['flg-50K'].map(lambda x:1 if x == ' >50K' else 0)
adult.groupby('fin_flg').size()

fin_flg
0    24720
1     7841
dtype: int64

In [29]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X = adult[['age','fnlwgt','education-num','capital-gain','capital-loss']]
y = adult['fin_flg']

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.5,random_state=0)

model = LogisticRegression()
model.fit(X_train,y_train)

print('正解率(train):{:.3f}'.format(model.score(X_train,y_train)))
print('正解率(test):{:.3f}'.format(model.score(X_test,y_test)))

正解率(train):0.797
正解率(test):0.798


In [30]:
model.coef_

array([[-1.185e-02, -4.379e-06, -2.774e-03,  3.274e-04,  7.532e-04]])

In [32]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

X = adult[['age','fnlwgt','education-num','capital-gain','capital-loss']]
y = adult['fin_flg']

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.5,random_state=0)

sc = StandardScaler()
sc.fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)

model = LogisticRegression()
model.fit(X_train_std,y_train)

print('正解率(train):{:.3f}'.format(model.score(X_train_std,y_train)))
print('正解率(test):{:.3f}'.format(model.score(X_test_std,y_test)))

正解率(train):0.811
正解率(test):0.810


In [35]:
from sklearn.datasets import load_breast_cancer
cancer = load_breast_cancer()
cancer.columns = ['mean radius', 'mean texture', 'mean perimeter', 'mean area',
        'mean smoothness', 'mean compactness', 'mean concavity',
        'mean concave points', 'mean symmetry', 'mean fractal dimension',
        'radius error', 'texture error', 'perimeter error', 'area error',
        'smoothness error', 'compactness error', 'concavity error',
        'concave points error', 'symmetry error',
        'fractal dimension error', 'worst radius', 'worst texture',
        'worst perimeter', 'worst area', 'worst smoothness',
        'worst compactness', 'worst concavity', 'worst concave points',
        'worst symmetry', 'worst fractal dimension']

In [39]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X = cancer['data']
y = cancer['target']

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.5,random_state=0)

model = LogisticRegression()
model.fit(X_train,y_train)

print('正解率(train):{:.3f}'.format(model.score(X_train,y_train)))
print('正解率(test):{:.3f}'.format(model.score(X_test,y_test)))

正解率(train):0.968
正解率(test):0.954


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [40]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.5,random_state=0)

sc = StandardScaler()
sc.fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)

model = LogisticRegression()
model.fit(X_train_std,y_train)

print('正解率(train):{:.3f}'.format(model.score(X_train_std,y_train)))
print('正解率(test):{:.3f}'.format(model.score(X_test_std,y_test)))

正解率(train):0.989
正解率(test):0.975


In [51]:
auto.head()

Unnamed: 0,price,horsepower,width,height
0,13495,111,64.1,48.8
1,16500,111,64.1,48.8
2,16500,154,65.5,52.4
3,13950,102,66.2,54.3
4,17450,115,66.4,54.3


In [53]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge,Lasso

X = auto.drop('price',axis=1)
y = auto['price']

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.5,random_state=0)

linear = LinearRegression()
ridge = Ridge(random_state=0)
clf = Lasso(alpha=1,random_state=0)

for model in [linear,ridge,clf]:
    model.fit(X_train,y_train)
    print('{}(train):{:.6f}'.format(model.__class__.__name__,model.score(X_train,y_train)))
    print('{}(test):{:.6f}'.format(model.__class__.__name__,model.score(X_test,y_test)))


LinearRegression(train):0.733358
LinearRegression(test):0.737069
Ridge(train):0.733355
Ridge(test):0.737768
Lasso(train):0.733358
Lasso(test):0.737107
