In [3]:
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.linear_model import LinearRegression, Lasso, LassoCV

# Load data

First, we concatenate the training and test sets so that the feature transformation only has to be coded once and is applied identically to both. After the transformation, the data is split back into training and test sets.

In [4]:
# Read the training and the test CSV files into separate data frames
train_path, test_path = 'data/train.csv', 'data/test.csv'
df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)

In [5]:
# Stack the rows of df_train on top of the rows of df_test.
# ignore_index is not set, so each frame keeps its own index labels
# and the combined index is therefore not unique.
df_total = pd.concat(objs=[df_train, df_test], axis=0)

# Transform

## 1. Support functions

In [8]:
def add_dummies(df, ex=[]):
    ''' Convert categorical variables into a set of dummy variables which 
        hold binary value. One dummy per category will be added to output 
        dataframe except the NaN value. If original column has no NaN value, 
        one category will be omitted to avoid multi-collinearity problem.
    
        param df: input data frame
        param ex (list of str): list of excluded columns
        
        return result: output data frame
    '''
    def is_category(column_name):
        return df[column_name].dtype=='O'
    
    def get_dummy_vars(column_name):
        # to make sure NaN value is the last element and exclude it
        return df[col].drop_duplicates().sort_values().to_list()[:-1]
    
    result = df.copy()
    
    for col in tqdm(df.columns):
        if col in ex or not is_category(col):
            continue
        else:           
            for v in get_dummy_vars(col):
                # add column for each distinct value of column
                result[str(col) + '.' + str(v)] = result.apply(lambda row: (row[col]==v and row[col]==row[col])*1, axis=1)
            
            # drop original column
            result = result.drop(columns=[col])
    return result

In [23]:
def checkna(dataframe):
    """ Report every column of `dataframe` that holds at least one NaN.

        For each such column one line is printed: the column name followed
        by the list of its distinct values. Nothing is returned.
    """
    for name in dataframe.columns:
        series = dataframe[name]
        if not series.isnull().values.any():
            # column is complete, nothing to report
            continue
        distinct_values = series.drop_duplicates().to_list()
        print(f'{name}: {distinct_values}')

## 2. Transform

In [21]:
# Replace every categorical column of the combined frame by its dummy columns
df_dummy = add_dummies(df=df_total)

100%|███████████████████████████████████████████████████████████████████████████████████| 81/81 [00:08<00:00,  9.89it/s]


In [25]:
# checkna(df_dummy)

In [28]:
# Replace every remaining NaN, in any column, by 0
df_dummy = df_dummy.fillna(value=0)
# checkna(df_dummy)

In [12]:
# Sanity check: prints nothing when run after the fillna cell above
checkna(dataframe=df_dummy)

## 3. Cross-validation lasso

In [30]:
# Lasso regression whose regularisation strength is selected by 10-fold
# cross-validation; the solver may run up to 10000 iterations
model = LassoCV(max_iter=10000, cv=10)

In [31]:
# split data
y_train = df_dummy.iloc[:1460]['SalePrice'].to_numpy()
X_train = df_dummy.iloc[:1460].drop(columns=['SalePrice', 'Id']).to_numpy()
y_test = df_dummy.iloc[1460:]['SalePrice'].to_numpy()
X_test = df_dummy.iloc[1460:].drop(columns=['SalePrice', 'Id']).to_numpy()

In [32]:
# Fit the cross-validated lasso on the training rows
model.fit(X=X_train, y=y_train)

In [33]:
# R^2 measured on the same rows the model was fitted on (not a held-out estimate)
model.score(X=X_train, y=y_train)

0.7236162334166498

## 4. Prediction

In [34]:
# Predict a sale price for every test row
y_pred = model.predict(X=X_test)

In [35]:
# Pair each prediction with the Id of its test row by position.
# The test rows are the last len(y_pred) rows of df_dummy. Building the frame
# from plain arrays avoids pd.concat(axis=1), which aligns on index labels and
# only worked because the test rows happened to carry the labels 0..n-1.
test_ids = df_dummy['Id'].iloc[len(df_dummy) - len(y_pred):].to_numpy()
prediction = pd.DataFrame({'Id': test_ids, 'SalePrice': y_pred})

In [36]:
# Write only the Id and SalePrice columns; index=False keeps the row index
# from being written as an extra unnamed first column
prediction.to_csv('submission.csv', index=False)

In [37]:
# Display the submission frame (Id and predicted SalePrice) for a visual check
prediction

Unnamed: 0,Id,SalePrice
0,1461,144394.020245
1,1462,173123.700823
2,1463,199496.579557
3,1464,198942.948668
4,1465,171596.115137
...,...,...
1454,2915,88922.102005
1455,2916,110616.750882
1456,2917,191376.367528
1457,2918,110341.837753
