# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


# Load the three input tables from the working directory: the labelled
# training rows, the rows to predict, and the submission template.
train, test, ss = (
    pd.read_csv(csv_name)
    for csv_name in ("train.csv", "test.csv", "sample_sub.csv")
)


def preprocess_data(df: pd.DataFrame, reference_year: int = 2013) -> pd.DataFrame:
    """Clean and feature-engineer an item/outlet sales frame in place.

    Steps, in order:

    1. Fill missing ``Item_Weight`` values with the column mean.
    2. Collapse the ``Item_Fat_Content`` spelling variants to ``1``
       (``'Regular'``, ``'reg'``) or ``0`` (``'Low Fat'``, ``'low fat'``,
       ``'LF'``).
    3. Replace ``Item_Visibility`` values equal to 0 with the column mean
       (the mean is computed over all rows, zeros included).
    4. Truncate ``Item_Identifier`` to its first two characters.
    5. Add ``Years_of_Operation`` as
       ``reference_year - Outlet_Establishment_Year``.

    Args:
        df: Frame containing the columns ``Item_Weight``,
            ``Item_Fat_Content``, ``Item_Visibility``, ``Item_Identifier``
            and ``Outlet_Establishment_Year``. It is modified in place.
        reference_year: Year from which the outlet age is measured.
            Defaults to 2013, the value previously hard-coded here.

    Returns:
        The same ``df`` object, after modification.

    Raises:
        ValueError: If ``Item_Fat_Content`` still holds a value that cannot
            be cast to ``int`` after the replacement (e.g. an unlisted
            spelling).
    """
    # Assign the filled column back instead of calling fillna(inplace=True)
    # on the column selection: the chained in-place form is deprecated and
    # does not update ``df`` when pandas Copy-on-Write is active.
    df["Item_Weight"] = df["Item_Weight"].fillna(df["Item_Weight"].mean())

    df['Item_Fat_Content'] = df['Item_Fat_Content'].replace({
        'Regular': 1, 'reg': 1, 'Low Fat': 0, 'low fat': 0, 'LF': 0
    }).astype(int)

    # NOTE(review): a visibility of exactly 0 is presumably a placeholder for
    # a missing measurement -- confirm against the data dictionary.
    visibility = df['Item_Visibility']
    df['Item_Visibility'] = np.where(visibility == 0, visibility.mean(), visibility)

    # Keep only the two-character prefix of the identifier.
    df['Item_Identifier'] = df['Item_Identifier'].str.slice(0, 2)

    df['Years_of_Operation'] = reference_year - df['Outlet_Establishment_Year']

    return df

# Run both splits through the shared cleaning routine (train first, then test).
train, test = (preprocess_data(frame) for frame in (train, test))

# Separate the regression target from the predictors. The raw establishment
# year is dropped as well; preprocess_data has already derived
# Years_of_Operation from it.
y_train = train['Item_Outlet_Sales']
X_train = train.drop(columns=['Item_Outlet_Sales', 'Outlet_Establishment_Year'])


# Integer-encode every object-dtype predictor, using the same mapping for the
# training and test frames. The encoder is re-fitted for each column.
le = LabelEncoder()
for col in X_train.select_dtypes(include=['object']):
    train_labels = X_train[col].astype(str)
    test_labels = test[col].astype(str)
    # Fit on the labels of both frames: an encoder fitted on train alone
    # raises ValueError in transform() for a category seen only in test.
    # When test has no unseen categories the resulting codes are unchanged.
    le.fit(list(train_labels) + list(test_labels))
    X_train[col] = le.transform(train_labels)
    test[col] = le.transform(test_labels)


# Hyper-parameters passed to the LightGBM regressor.
model_params = dict(
    objective='regression',
    n_estimators=1000,
    learning_rate=0.011,
    max_depth=5,
    max_bin=322,
    min_child_samples=150,
    reg_lambda=0.001061926310,
    random_state=2020,
)

# Build the regressor and fit it on the full training set; fit() returns the
# estimator itself, so ``model`` is the fitted regressor.
model = LGBMRegressor(**model_params).fit(X_train, y_train)


# Score the test rows using the same columns, in the same order, as in training.
feature_cols = X_train.columns
raw_pred = model.predict(test[feature_cols])

# Floor the predictions at 33.
# NOTE(review): 33 is presumably close to the smallest sales value observed
# in training -- confirm before changing it.
test_pred = raw_pred.clip(min=33)

# Write the predictions into the submission template and save it without
# the index column.
ss['Item_Outlet_Sales'] = test_pred
ss.to_csv('bigmart_predictions.csv', index=False)

