In [62]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error as mae
from sklearn.decomposition import PCA
import math
from sys import stdout
%matplotlib inline

def rmsle(y_true, y_pred):
    assert len(y_true) == len(y_pred)
    return np.square(np.log(y_pred + 1) - np.log(y_true + 1)).mean() ** 0.5

#открыть данные

filename = 'C:/anaconda/train.csv'
data = pd.read_csv(filename)
#data.head(5)

#избавиться от отсутствующих

data = data.fillna(data.median(axis=0), axis=0)

categorical_columns = [c for c in data.columns if data[c].dtype.name == 'object']
numerical_columns   = [c for c in data.columns if data[c].dtype.name != 'object']
data_describe = data.describe(include=[object])
for c in categorical_columns:
    data[c] = data[c].fillna(data_describe[c]['top'])

#преобразование в количественные
    
binary_columns    = [c for c in categorical_columns if data_describe[c]['unique'] == 2]
nonbinary_columns = [c for c in categorical_columns if data_describe[c]['unique'] > 2]

#print(binary_columns)
#print(nonbinary_columns)

#bin

data_describe = data.describe(include=[object])

for c in binary_columns:
    top = data_describe[c]['top']
    top_items = data[c] == top
    data.loc[top_items, c] = 0
    data.loc[np.logical_not(top_items), c] = 1
       
#nonbin 

data_nonbinary = pd.get_dummies(data[nonbinary_columns])
#print(data_nonbinary.columns)

#нормализэйшн

data_numerical = data[numerical_columns]
data_numerical = (data_numerical - data_numerical.mean()) / data_numerical.std()
data_numerical.describe()

#делаем новую таблицу с переделанными данными

data = pd.concat((data_numerical, data[binary_columns], data_nonbinary), axis=1)
data = pd.DataFrame(data, dtype=float)
#print(data.shape)
#print(data.columns)

X = data.drop(('SalePrice'), axis=1)  # Выбрасываем столбец 'SalePrice'.
y = data['SalePrice']
feature_names = X.columns

#метод главных компонент

pca = PCA(n_components = 5)
XPCAreduced = pca.fit_transform(X)
#print(XPCAreduced)

#print(feature_names)

#обраотка данных на тренировочную и тестовую 

X_train, X_test, y_train, y_test = train_test_split(XPCAreduced, y, test_size = 0.3, random_state = 11)

N_train, _ = X_train.shape 
N_test,  _ = X_test.shape 
#print(N_train, N_test)

#реализация библиотечного

lr = LinearRegression()
lr.fit(X_train, y_train)

y_train_predict = lr.predict(X_train)
y_test_predict = lr.predict(X_test)

#print(y_train_predict)
#print(y_test_predict)

print("sklearn")
print("MAE: ", mae(y_test, y_test_predict))
print("RMSE: ", rmsle(y_test, y_test_predict))

#реализация ручками

def predict_outcome(feature_matrix, weights):
    weights=np.array(weights)
    predictions = np.dot(feature_matrix, weights)
    return predictions

def errors(output,predictions):
    errors=predictions-output
    return errors

def feature_derivative(errors, feature):
    derivative=np.dot(2,np.dot(feature,errors))
    return derivative


def regression_gradient_descent(feature_matrix, output, initial_weights, step_size, tolerance):
    converged = False
    #Начальные веса преобразуются в массив numpy
    weights = np.array(initial_weights)
    while not converged:
        # вычислить прогнозы на основе feature_matrix и весов:
        predictions=predict_outcome(feature_matrix,weights)
        # вычислять ошибки как predictions - output:
        error=errors(output,predictions)
        gradient_sum_squares = 0 # инициализирование градиента
        # пока не сходится, обновлять каждый вес отдельно:
        for i in range(len(weights)):
            # Вызов feature_matrix[:, i] если столбец фич связан с весами[i]
            feature=feature_matrix[:, i]
            deriv=feature_derivative(error,feature)
            # добавить квадратную производную к величине градиента
            gradient_sum_squares=gradient_sum_squares+(deriv**2)
            # обновить вес на основе размера шага и производной:
            weights[i]=weights[i] - np.dot(step_size,deriv)

        gradient_magnitude = math.sqrt(gradient_sum_squares)
        stdout.write("\r%d" % int(gradient_magnitude))
        stdout.flush()
        if gradient_magnitude < tolerance:
            converged = True
    return(weights)

simple_feature_matrix = XPCAreduced
output = y
initial_weights = np.array([0.1, 0.001, 0.001, 0.001, 0.001])
step_size = 0.00001
tolerance = 2.5e7
simple_weights = regression_gradient_descent(simple_feature_matrix, output, initial_weights, step_size, tolerance)
#print(simple_weights)

hand_y_train_predict = np.dot(X_train, simple_weights)
hand_y_test_predict = np.dot(X_test, simple_weights)

print("hands")
print("MAE: ", mae(y_test, hand_y_test_predict))
print("RMSE: ", rmsle(y_test, hand_y_test_predict))

sklearn
MAE:  0.29166092309823005
RMSE:  0.5146755140764067
4897hands
MAE:  0.42886669557810053
RMSE:  0.6694882190819695


  from ipykernel import kernelapp as app


In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
#from sklearn.liner_model import LinearRegression
#from sklearn.model_selection import train_test_split
#from sklearn.preproccessing import MinMaxScaler
#from sklearn.metrics import mean_absolute_error
%matplotlib inline

def rmsle(y_true, y_pred):
    assert len(y_true) == len(y_pred)
    return np.square(np.log(y_pred + 1) - np.log(y_true + 1)).mean() ** 0.5

filename = 'C:/anaconda/train.csv'
data = pd.read_csv(filename)
data.head(20)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
5,6,50,RL,85.0,14115,Pave,,IR1,Lvl,AllPub,...,0,,MnPrv,Shed,700,10,2009,WD,Normal,143000
6,7,20,RL,75.0,10084,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,307000
7,8,60,RL,,10382,Pave,,IR1,Lvl,AllPub,...,0,,,Shed,350,11,2009,WD,Normal,200000
8,9,50,RM,51.0,6120,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2008,WD,Abnorml,129900
9,10,190,RL,50.0,7420,Pave,,Reg,Lvl,AllPub,...,0,,,,0,1,2008,WD,Normal,118000
