In [1]:
import pandas as pd
import sklearn.model_selection
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn.linear_model
import sklearn.metrics
import sklearn.tree
import sklearn.ensemble

In [2]:
def preproc(X):
    """Предварительно преобразовывает"""
    
    #Замена str на числа
    X.loc[X["Ecology_2"] == "A", "Ecology_2"] = 1
    X.loc[X["Ecology_2"] == "B", "Ecology_2"] = 2
    X.loc[X["Ecology_3"] == "A", "Ecology_3"] = 1
    X.loc[X["Ecology_3"] == "B", "Ecology_3"] = 2
    X.loc[X["Shops_2"] == "A", "Shops_2"] = 1
    X.loc[X["Shops_2"] == "B", "Shops_2"] = 2

In [3]:
def fit(X, y):
    """Вычисляет параметры"""
    
    def set_param_price(param, X, params):
        """Добавляет в параметры среднее значение цены по разным значениям фичи param"""
        
        df = X.copy()
        df["Price"] = y
        df = df.loc[:, [param, "Price"]]
        params[f"med_price_{param.lower()}"] = df.groupby([param]).median().rename(columns={"Price": f"MedPrice{param}"})
    
    #Параметры для функции transform()
    params = {
        "medians": {}, 
        "Healthcare_1": [], 
        "distr_size": pd.DataFrame(),
        "med_price_rooms": pd.DataFrame(),
        "med_price_floor": pd.DataFrame()
    }
    
    for feature in X.columns:
        params["medians"][feature] = X.loc[X[feature].notna(), feature].median()
    
    #Вычисляет средние значения Healthcare_1 по разным группам значений фичи Social_1
    for i in range(7):
        params["Healthcare_1"].append(X.loc[(X["Healthcare_1"].notna()) 
                                                  & (i*10 <= X["Social_1"]) 
                                                  & (X["Social_1"] < (i+1)*10), "Healthcare_1"].median())
    params["Healthcare_1"].append(params["Healthcare_1"][-1])
    
    params["distr_size"] = pd.DataFrame(X["DistrictId"].value_counts()).reset_index().rename(columns={"index": "DistrictId", "DistrictId": "DistrictSize"})
    
    set_param_price("Rooms", X, params)
    set_param_price("Floor", X, params)
    set_param_price("Ecology_2", X, params)
    set_param_price("Ecology_3", X, params)
    set_param_price("Helthcare_2", X, params)
    set_param_price("Shops_2", X, params)
    
    return params

In [4]:
def transform(X, params):
    """Трансформирует данные"""
    
    #Устранение NaN
    X['LifeSquare_nan'] = X['LifeSquare'].isna() * 1
    X['Healthcare_1_nan'] = X['Healthcare_1'].isna() * 1
    X.loc[X["LifeSquare"].isna(), "LifeSquare"] = X.loc[X["LifeSquare"].isna(), "Square"] - X.loc[X["LifeSquare"].isna(), "KitchenSquare"] - 3
    for i in range(8):
        X.loc[(X["Healthcare_1"].isna()) & (i*10 <= X["Social_1"]) 
              & (X["Social_1"] < (i+1)*10), "Healthcare_1"] = params["Healthcare_1"][i]
    
    #Устранение выбросов
    X.loc[X["Rooms"] == 0, "Rooms"] = 1
    X.loc[X["Rooms"] > 7, "Rooms"] = params["medians"]["Rooms"]
    X.loc[:, "corr_Rooms"] = 0
    X.loc[(X["Rooms"] == 0) | (X["Rooms"] > 7), "corr_Rooms"] = 1
    X.loc[(X["Square"] > 200) | (X["Square"] < 15), "Square"] = params["medians"]["Square"]
    X.loc[:, "corr_Square"] = 0
    X.loc[(X["Square"] > 200) | (X["Square"] < 15), "corr_Square"] = 1
    X.loc[(X["LifeSquare"] > 200) | (X["LifeSquare"] < 10), "LifeSquare"] = params["medians"]["LifeSquare"]    
    X.loc[:, "corr_LifeSquare"] = 0
    X.loc[(X["LifeSquare"] > 200) | (X["LifeSquare"] < 10), "corr_LifeSquare"] = 1
    X.loc[X["KitchenSquare"] < 2, "KitchenSquare"] = 2
    X.loc[X["KitchenSquare"] > 20, "KitchenSquare"] = params["medians"]["KitchenSquare"]
    X.loc[:, "corr_KitchenSquare"] = 0
    X.loc[(X["KitchenSquare"] > 20) | (X["KitchenSquare"] < 2), "corr_KitchenSquare"] = 1
    X.loc[X["Floor"] == 0, "Floor"] = 1
    X.loc[X["Floor"] > 40, "Floor"] = params["medians"]["Floor"]
    X.loc[:, "corr_Floor"] = 0
    X.loc[(X["Floor"] > 40) | (X["Floor"] == 0), "corr_Floor"] = 1
    X.loc[X["HouseFloor"] == 0, "HouseFloor"] = 1
    X.loc[X["HouseFloor"] > 40, "HouseFloor"] = params["medians"]["HouseFloor"]
    X.loc[:, "corr_HouseFloor"] = 0
    X.loc[(X["HouseFloor"] > 40) | (X["HouseFloor"] == 0), "corr_HouseFloor"] = 1
    X.loc[(X["HouseYear"] < 1920), "HouseYear"] = params["medians"]["HouseYear"]
    X.loc[(X["HouseYear"] > 2020), "HouseYear"] = 2020
    X.loc[:, "corr_HouseYear"] = 0
    X.loc[(X["HouseYear"] > 2021) | (X["HouseYear"] < 1920), "corr_HouseYear"] = 1
    
    #Проверка, что этаже не выше этажности дома
    X.loc[X["LifeSquare"] > X["Square"], "LifeSquare"] = X.loc[X["LifeSquare"] > X["Square"], "Square"] - X.loc[X["LifeSquare"] > X["Square"], "KitchenSquare"] - 3
    
    #Добавление размера района
    X = X.merge(data_params["distr_size"], on="DistrictId", how="left")
    X.loc[X["DistrictSize"].isna(), "DistrictSize"] = 5
    
    #Добавление средних значений цен для разных значений разных фичей
    X = X.merge(params["med_price_rooms"], on="Rooms", how="left")
    X.loc[X["MedPriceRooms"].isna(), "MedPriceRooms"] = 1
    X = X.merge(params["med_price_floor"], on="Floor", how="left")
    X.loc[X["MedPriceFloor"].isna(), "MedPriceFloor"] = 1
    X = X.merge(params["med_price_ecology_2"], on="Ecology_2", how="left")
    X.loc[X["MedPriceEcology_2"].isna(), "MedPriceEcology_2"] = 1
    X = X.merge(params["med_price_ecology_3"], on="Ecology_3", how="left")
    X.loc[X["MedPriceEcology_3"].isna(), "MedPriceEcology_3"] = 1
    X = X.merge(params["med_price_helthcare_2"], on="Helthcare_2", how="left")
    X.loc[X["MedPriceHelthcare_2"].isna(), "MedPriceHelthcare_2"] = 1
    X = X.merge(params["med_price_shops_2"], on="Shops_2", how="left")
    X.loc[X["MedPriceShops_2"].isna(), "MedPriceShops_2"] = 1
    
    X = X.loc[:, X.columns[1:]]
    
    return X

In [5]:
def choose_model(model_name, params={}):
    """Возвращает заданную модель с заданными параметрами"""
    
    if model_name == "lr":
        model = sklearn.linear_model.LinearRegression()
    elif model_name == "tr":
        model = sklearn.tree.DecisionTreeRegressor(**params)
    elif model_name == "rf":
        model = sklearn.ensemble.RandomForestRegressor(**params)
    elif model_name == "gb":
        model = sklearn.ensemble.GradientBoostingRegressor(**params)
    else:
        raise Exception("Wrong model name")
    
    return model

In [6]:
def single_model(model_name, X_train, y_train, X_test, y_test=None, params={}):
    """Обучает заданную модель с заданными параметрами на заданных значениях и выводит оценку R^2"""
    
    model = choose_model(model_name, params=params)
    
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_test)
    
    if y_test is not None:   
        
        y_train_pred = model.predict(X_train)
        
        r2_train = sklearn.metrics.r2_score(y_train, y_train_pred)
        r2_test = sklearn.metrics.r2_score(y_test, y_pred)
        
        print(f"Тренировочное значение: {r2_train}\nТестовое значение: {r2_test}")
    
    return y_pred

In [7]:
train_full = pd.read_csv("train.csv")

#### Разбиение данных

In [8]:
X_train, X_test = sklearn.model_selection.train_test_split(train_full, test_size=0.3, random_state=50)
y_train = X_train["Price"]
y_test = X_test["Price"]
X_train = X_train.loc[:, X_train.columns[:-1]]
X_test = X_test.loc[:, X_test.columns[:-1]]

In [9]:
X_train

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2
5189,4842,5,2.0,44.408977,26.716904,6.0,2,5.0,1966,0.150818,B,B,16,3433,4,2643.0,4,5,B
7969,3791,45,2.0,66.243833,,0.0,16,0.0,1977,0.195781,B,B,23,5212,6,,3,2,B
9039,1270,30,2.0,66.486158,,1.0,16,17.0,1977,0.000078,B,B,22,6398,141,1046.0,3,23,B
5208,4050,193,2.0,46.034264,33.918107,5.0,4,5.0,1966,0.319809,B,B,25,4756,16,2857.0,5,8,B
506,12624,78,3.0,61.750846,48.029672,6.0,5,9.0,1972,0.092291,B,B,21,4346,2,165.0,1,2,B
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8262,10245,87,3.0,64.184158,42.093763,9.0,5,9.0,1981,0.100456,B,B,43,7227,0,,1,6,A
6214,14482,168,3.0,73.950531,46.192409,8.0,9,9.0,1962,0.341072,B,B,27,5664,48,2300.0,3,11,B
8324,9364,6,1.0,41.585376,,1.0,19,17.0,2014,0.243205,B,B,5,1564,0,540.0,0,0,B
6253,13184,37,1.0,40.306909,23.896674,7.0,5,17.0,1989,0.178917,B,B,18,3594,4,192.0,1,5,B


In [10]:
preproc(X_train)
preproc(X_test)
data_params = fit(X_train, y_train)
X_train = transform(X_train, data_params)
X_test = transform(X_test, data_params)

In [11]:
single_model("lr", X_train, y_train, X_test, y_test)

Тренировочное значение: 0.5778772788125875
Тестовое значение: 0.5840218041910353


array([116133.34317813, 415579.22520856, 158919.15813132, ...,
       125312.82533762, 268269.02886741, 120253.34424245])

In [12]:
model_params = {"criterion": "squared_error", "max_depth": 5, 
                "min_samples_leaf": 3, "random_state": 50}
single_model("tr", X_train, y_train, X_test, y_test, params=model_params)

Тренировочное значение: 0.661363792295282
Тестовое значение: 0.6377581748229175


array([118351.25872854, 485696.07378313, 184206.30158601, ...,
       143621.26815803, 219676.01815395, 118351.25872854])

In [13]:
model_params = {"criterion": "squared_error", "max_depth": 10, 
                "n_estimators": 20, "min_samples_leaf": 25, "random_state": 50}
single_model("rf", X_train, y_train, X_test, y_test, params=model_params)

Тренировочное значение: 0.7425932531784927
Тестовое значение: 0.7123761119819592


array([130513.00986046, 428955.76977631, 182440.09686915, ...,
       146085.31649033, 220093.92239388, 116926.47186348])

In [14]:
model_params = {"criterion": "squared_error", "max_depth": 3, 
                "n_estimators": 200, "min_samples_leaf": 50, "random_state": 50}
single_model("gb", X_train, y_train, X_test, y_test=y_test, params=model_params)

Тренировочное значение: 0.783383597122526
Тестовое значение: 0.7518181074955655


array([123528.80880476, 420305.69181054, 196905.69635652, ...,
       146165.24838186, 202548.95618117, 113812.73645489])

In [15]:
test_full = pd.read_csv("test.csv")

In [16]:
preproc(test_full)
test_full = transform(test_full, data_params)

In [17]:
result = single_model("gb", X_train, y_train, test_full, params=model_params)

In [18]:
sample = pd.read_csv("sample_submission.csv")

In [19]:
sample["Price"] = result

In [20]:
sample.to_csv("changes1_gb_sample.csv", index=False)