In [21]:
import pandas as pd
from sklearn.impute import KNNImputer

In [47]:
df = pd.read_csv("/content/train.csv")
df_test = pd.read_csv("/content/test.csv")
df.columns, df.shape, df_test.shape

(Index(['ID', 'Company', 'Quarter', 'QuickRatio', 'InventoryRatio',
        'RevenueGrowth', 'MarketshareChange', 'Bond rating', 'Stock rating',
        'Region', 'Industry', 'Sales'],
       dtype='object'),
 (675, 12),
 (150, 11))

In [23]:
df.describe()

Unnamed: 0,ID,QuickRatio,InventoryRatio,RevenueGrowth,MarketshareChange,Sales
count,675.0,675.0,523.0,675.0,675.0,525.0
mean,394.555556,1.603867,4.265124,-0.009733,-0.002904,3556.708571
std,204.960069,0.595615,3.108644,0.06739,0.017622,2028.059368
min,0.0,0.5,1.26,-0.2,-0.05,864.0
25%,216.5,0.99,2.63,-0.07,-0.015,1992.0
50%,433.0,1.73,3.42,0.0,0.0,3007.0
75%,579.0,2.155,4.725,0.05,0.01,4523.0
max,674.0,2.49,24.84,0.08,0.02,11686.0


In [24]:
df = df[['Company','QuickRatio', 'InventoryRatio',
        'RevenueGrowth', 'MarketshareChange', 'Bond rating', 'Stock rating',
        'Region', 'Industry', 'Sales']]

In [25]:
df_test = df_test[['Company', 'QuickRatio', 'InventoryRatio',
        'RevenueGrowth', 'MarketshareChange', 'Bond rating', 'Stock rating',
        'Region', 'Industry']]

In [26]:
df.head()

Unnamed: 0,Company,QuickRatio,InventoryRatio,RevenueGrowth,MarketshareChange,Bond rating,Stock rating,Region,Industry,Sales
0,CMP01,2.02,7.71,0.05,-0.04,CCC,Buy,South,Metal Fabrication,1517.0
1,CMP01,2.01,4.1,0.03,0.0,CCC,Hold,South,Metal Fabrication,2968.0
2,CMP01,2.02,6.79,0.06,-0.02,CCC,Buy,South,Metal Fabrication,1497.0
3,CMP01,1.98,3.97,0.01,0.02,CCC,Buy,South,Metal Fabrication,2929.0
4,CMP01,1.96,7.41,-0.07,0.02,CCC,Buy,South,Metal Fabrication,1452.0


In [27]:
from sklearn.preprocessing import OneHotEncoder
columns=['Company','Bond rating', 'Stock rating', 'Region', 'Industry']

encoder = OneHotEncoder(handle_unknown="ignore", sparse = False)


encoded_data = encoder.fit_transform(df[columns])


df_encoded = pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out(columns))


X_encoded = pd.concat([df.drop(columns=columns), df_encoded], axis=1)






In [28]:
data_test = df_test[columns]

In [29]:
test_encoded = encoder.transform(data_test)

In [30]:
test_encoded_df = pd.DataFrame(test_encoded, columns=encoder.get_feature_names_out(columns))

In [31]:
test_df = pd.concat([df_test.drop(columns=columns), test_encoded_df], axis=1)

In [32]:
X_encoded.shape

(675, 99)

In [33]:
X_encoded.isnull().sum()

QuickRatio                      0
InventoryRatio                152
RevenueGrowth                   0
MarketshareChange               0
Sales                         150
                             ... 
Region_South                    0
Region_West                     0
Industry_Automobile             0
Industry_Infrastructure         0
Industry_Metal Fabrication      0
Length: 99, dtype: int64

In [34]:
imputer = KNNImputer(n_neighbors=3)
df_encoded_clean = imputer.fit_transform(X_encoded)
X_imputed_df = pd.DataFrame(df_encoded_clean, columns=X_encoded.columns)

In [35]:
df_test_encoded_clean = imputer.fit_transform(test_df)

X_imputed_df_test = pd.DataFrame(df_test_encoded_clean, columns=test_df.columns)

In [36]:
X_imputed_df.isnull().sum(), X_imputed_df.shape

(QuickRatio                    0
 InventoryRatio                0
 RevenueGrowth                 0
 MarketshareChange             0
 Sales                         0
                              ..
 Region_South                  0
 Region_West                   0
 Industry_Automobile           0
 Industry_Infrastructure       0
 Industry_Metal Fabrication    0
 Length: 99, dtype: int64,
 (675, 99))

In [37]:
X_imputed_df_test.isnull().sum(), X_imputed_df_test.shape

(QuickRatio                    0
 InventoryRatio                0
 RevenueGrowth                 0
 MarketshareChange             0
 Company_CMP01                 0
                              ..
 Region_South                  0
 Region_West                   0
 Industry_Automobile           0
 Industry_Infrastructure       0
 Industry_Metal Fabrication    0
 Length: 98, dtype: int64,
 (150, 98))

In [38]:
Y = X_imputed_df['Sales']
X = X_imputed_df.drop(columns=['Sales'])

import numpy as np
X = np.asarray(X).astype(np.float32)
Y = np.asarray(Y).astype(np.float32)

In [39]:
Y.shape, X.shape

((675,), (675, 98))

In [40]:
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [41]:
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
import tensorflow as tf
from keras import backend as K

def root_mean_squared_error(y_true, y_pred):
        return K.sqrt(K.mean(K.square(y_pred - y_true), axis=-1))

model = Sequential()
model.add(Dense(98, input_shape=(98,), activation='relu', kernel_regularizer='l2'))
model.add(Dense(60, activation='relu', kernel_regularizer='l2'))
model.add(Dense(20, activation='relu', kernel_regularizer='l2'))
model.add(Dense(1))

opt = tf.keras.optimizers.Adam(lr=0.01)
model.compile(loss=root_mean_squared_error, optimizer=opt)



In [42]:
from keras.callbacks import EarlyStopping
early_stopping_monitor = EarlyStopping(monitor='val_loss', patience=10)
model.fit(x=X, y=Y, validation_split = 0.1, batch_size=2, epochs=100, callbacks = [early_stopping_monitor])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100


<keras.src.callbacks.History at 0x7e73955b5570>

In [43]:
test_predictions = model.predict(X_test)



In [44]:
from sklearn.metrics import mean_absolute_error

mae = mean_absolute_error(Y_test, test_predictions)
print("Mean Absolute Error (MAE) on the testing set:", mae)


Mean Absolute Error (MAE) on the testing set: 553.74664


In [50]:

train_predictions = model.predict(X_train)

mae_train = mean_absolute_error(Y_train, train_predictions)
print("Mean Absolute Error (MAE) on the training set:", mae_train)

Mean Absolute Error (MAE) on the training set: 616.35345


In [45]:
predictions = model.predict(X_imputed_df_test)



In [48]:
predictions = pd.DataFrame(predictions,columns=["Sales"])
result = pd.concat([df_test["ID"],predictions],axis=1)
result.to_csv("DL-6.csv",index=False)

In [49]:
submission = pd.read_csv("DL-6.csv")
submission

Unnamed: 0,ID,Sales
0,7,2813.6704
1,8,2354.6934
2,16,4526.0690
3,17,3829.7537
4,25,5985.7856
...,...,...
145,656,5636.7266
146,664,2913.5935
147,665,3215.0542
148,673,1847.7676
