In [30]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler


In [31]:
# Folder holding train.csv / test.csv. Defaults to the original local path so
# existing runs behave the same; set the AC_DATA_DIR environment variable to
# point the notebook at the data on another machine.
DATA_DIR = Path(os.environ.get("AC_DATA_DIR", "/Users/uzair/fractal/new"))

df_train = pd.read_csv(DATA_DIR / "train.csv")
df_test = pd.read_csv(DATA_DIR / "test.csv")

In [32]:
# (rows, columns) of the training frame as loaded, before any cleaning.
df_train.shape

(15000, 15)

In [33]:
# (rows, columns) of the test frame as loaded, before any cleaning.
df_test.shape

(5000, 14)

In [34]:
# Clean the training frame in one pass:
#   1. cells that consist solely of a placeholder token become NaN,
#   2. the shorthand '1k' is spelled out as "1000",
#   3. any row still containing a NaN is discarded.
# replace() matches whole cell values here (no regex), so '.' does not touch
# decimal numbers such as "1.5".
df_train = (
    df_train
    .replace(['$$', '.', '__', '?', 'blank'], np.nan)
    .replace('1k', "1000")
    .dropna()
)

In [35]:
# Remove exact duplicate rows from both frames, then key the training rows
# by "ID" so the identifier is not treated as a feature column.
#
# The training step is guarded so this cell can be re-run safely: once "ID"
# has become the index, calling set_index("ID") again raises KeyError, and a
# second drop_duplicates() would compare rows *without* their ID and could
# discard rows that were distinct on the first pass.
if "ID" in df_train.columns:
    df_train = df_train.drop_duplicates().set_index("ID")

# The test frame keeps "ID" as a regular column; it is needed later to label
# the predictions.
df_test = df_test.drop_duplicates()

In [36]:
# Separate the regression target from the predictor columns.
target_col = "Electricity_Consumption_AC"
y_train = df_train[target_col]
X_train = df_train.drop(columns=[target_col])

In [37]:
# Preview the first rows of the predictor frame (indexed by ID).
X_train.head()

Unnamed: 0_level_0,Type_of_AC,Model,Cooling_Capacity,Avg_Temp,Room_SIze,Num_People,Star_Rating,Age_of_Unit,Maintenance,Avg_Usage,Avg_AC_Temp,City,Input_Power
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
795823,Split,1.5,6579.15,43,1873.5,3,3.0,5.0,0,1.72,31.333333,Cairo,2863.2952214614106
506848,Window,1.1,6775.44,46,1603.8000000000002,4,4.0,1.0,0,18.0,29.5,Bucharest,2783.3096980924597
441027,Portable,3.8,6995.22,29,4807.0,38,4.0,1.0,0,0.551,13.763158,Berlin,2882.6064181342504
160677,Window,1.2,1863.68,39,1466.4,10,4.0,0.0,0,19.54,22.9,Cape Town,2607.089566348616
553568,Portable,2.2,6014.6,43,3113.0000000000005,6,4.0,6.0,1,17.286,18.166667,Moscow,2994.4946029176726


In [38]:
# List the predictor column names.
X_train.columns

Index(['Type_of_AC', 'Model', 'Cooling_Capacity', 'Avg_Temp', 'Room_SIze',
       'Num_People', 'Star_Rating', 'Age_of_Unit', 'Maintenance', 'Avg_Usage',
       'Avg_AC_Temp', 'City', 'Input_Power'],
      dtype='object')

In [39]:
# Predictor columns grouped by how they are preprocessed.
# Names are spelled exactly as in the data header (note "Room_SIze").
numerical_cols = [
    'Model',
    'Cooling_Capacity',
    'Avg_Temp',
    'Room_SIze',
    'Num_People',
    'Star_Rating',
    'Age_of_Unit',
    'Maintenance',
    'Avg_Usage',
    'Avg_AC_Temp',
    'Input_Power',
]
categorical_cols = [
    "Type_of_AC",
    "City",
]


In [40]:
# Numeric branch: fill missing values with the column mean, then standardise.
numeric_imputer = SimpleImputer(strategy='mean')
numeric_scaler = StandardScaler()
numerical_xfer = Pipeline(
    steps=[
        ("imputer", numeric_imputer),
        ("scaler", numeric_scaler),
    ]
)

In [41]:
# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown="ignore" makes categories that were not
# seen during fit encode as all zeros instead of raising at predict time.
category_imputer = SimpleImputer(strategy="most_frequent")
category_encoder = OneHotEncoder(handle_unknown="ignore")
cat_xfer = Pipeline(
    steps=[("imputer", category_imputer), ("onehot", category_encoder)]
)

In [42]:
# Send each column group through its own branch. Columns named in neither
# group are dropped (ColumnTransformer's default `remainder`).
column_branches = [
    ("num", numerical_xfer, numerical_cols),
    ("cat", cat_xfer, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_branches)

In [43]:
# Full estimator: preprocessing followed by a linear regression, so the same
# transformations are applied at fit and at predict time.
model_steps = [
    ("preprocessor", preprocessor),
    ("regressor", LinearRegression()),
]
model = Pipeline(steps=model_steps)

In [44]:
# Fit the preprocessing steps and the regressor together on the training rows.
model.fit(X_train,y_train)

In [45]:
# Work on a copy: plain assignment would only alias df_test, so the in-place
# cleaning applied to X_test afterwards would silently rewrite df_test too.
X_test = df_test.copy()


In [46]:
# (rows, columns) of the test frame; it still carries the "ID" column.
X_test.shape

(5000, 14)

In [47]:
# Apply the same token clean-up as for the training frame: placeholder cells
# become NaN and the shorthand '1k' is spelled out as "1000".
# Rows with missing values are deliberately NOT dropped here, so every test
# row still receives a prediction; the imputers inside the model pipeline
# fill the gaps.
missing_tokens = ['$$', '.', '__', '?', 'blank']
X_test.replace(missing_tokens, np.nan, inplace=True)
X_test.replace('1k', "1000", inplace=True)

In [48]:
# Predict with the fitted pipeline, which applies its preprocessing steps to
# X_test before the regressor.
y_pred = model.predict(X_test)

In [49]:
# Pair every test ID with its predicted consumption (row order is preserved).
prediction_df = df_test[["ID"]].assign(Electricity_Consumption_AC=y_pred)

In [50]:
# Spot-check the first ten predictions before writing them out.
prediction_df.head(10)

Unnamed: 0,ID,Electricity_Consumption_AC
0,191220,2.091797
1,174111,1.711914
2,614605,2.166016
3,865325,1.84375
4,134921,2.166016
5,180720,1.817383
6,651968,2.187988
7,267316,2.618652
8,991959,2.353027
9,708804,2.568848


In [51]:
# Write the predictions to the current working directory; index=False keeps
# the DataFrame index out of the file so it holds only the two data columns.
prediction_df.to_csv("prediction2.csv",index=False)