In [None]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor

# Load datasets (paths are hard-coded to /content — presumably a Colab
# session; adjust if running elsewhere).
test_df = pd.read_csv('/content/test.csv')

# Training frame: discard the ID and Quarter columns, then keep only the rows
# that actually have a Sales value, since Sales is the regression target.
train_df = (
    pd.read_csv('/content/train.csv')
    .drop(columns=["ID", "Quarter"])
    .dropna(subset=['Sales'])
)

# Feature groups fed to the ColumnTransformer below:
#   * Numerical_Cols   - fixed, hand-picked list
#   * Categorical_Cols - every object-dtype column left in the training frame
Numerical_Cols = ["InventoryRatio"]
Categorical_Cols = train_df.select_dtypes(include=['object']).columns

# Diagnostic only: print the distinct values of each categorical column in
# both frames so differences between train and test can be spotted by eye.
# Nothing is modified here.
for col in Categorical_Cols:
    train_unique, test_unique = (
        set(frame[col].unique()) for frame in (train_df, test_df)
    )
    print(f"Training unique values for {col}: {train_unique}")
    print(f"Test unique values for {col}: {test_unique}")

# Preprocessor, assembled from two named branches.

# Numeric branch: fill missing values with the column mean, then standardize.
numeric_steps = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical branch: one-hot encode; handle_unknown='ignore' means a category
# that was not seen during fit does not raise at transform time.
categorical_encoder = OneHotEncoder(handle_unknown='ignore')

# Route each column group to its branch. Columns listed in neither group are
# not passed to any transformer here.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_steps, Numerical_Cols),
    ('cat', categorical_encoder, Categorical_Cols),
])

# Separate features from the target. Note this is NOT a random hold-out split:
# X_train/Y_train are the full training file, X_test is the unlabeled test file.
Y_train = train_df["Sales"]
X_train = train_df.drop(columns=["Sales"])

# The test frame still carries ID and Quarter; remove them so its columns
# line up with what the training frame kept.
X_test = test_df.drop(columns=["ID", "Quarter"])

# Standalone preprocessing pass: learn the transforms on the training features
# and apply the same fitted transforms to the test features. The model
# pipeline defined afterwards calls fit on this same preprocessor object again.
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# Define and fit the model.
# Preprocessing and the regressor live in one Pipeline so that fit() and
# predict() always apply the same transformations to raw feature frames.
model = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        # A fixed seed makes the forest (bootstrap samples and feature
        # sub-sampling) deterministic, so predictions, the submission file and
        # the reported score are reproducible across Restart & Run All.
        ('regressor', RandomForestRegressor(random_state=42))
    ])

model.fit(X_train, Y_train)

# Predict on the test data (raw features; the pipeline preprocesses them).
predictions = model.predict(X_test)

# Save predictions to CSV.
# model.predict returns one value per row of X_test, in row order. Give the
# predictions the same index as test_df so that the column-wise concat below
# pairs each ID with its own prediction. pd.concat(axis=1) aligns on index
# labels, so a fresh 0..n-1 index on `sales` would misalign or introduce NaNs
# if test_df were ever filtered or re-indexed.
sales = pd.DataFrame(predictions, columns=["Sales"], index=test_df.index)
result = pd.concat([test_df['ID'], sales], axis=1)
result.to_csv("submission.csv", index=False)

# Read submission data back from disk as a round-trip check and display it.
submission_data = pd.read_csv('submission.csv')
submission_data


Training unique values for Company: {'CMP26', 'CMP46', 'CMP17', 'CMP38', 'CMP03', 'CMP55', 'CMP67', 'CMP63', 'CMP34', 'CMP68', 'CMP57', 'CMP65', 'CMP08', 'CMP01', 'CMP02', 'CMP21', 'CMP47', 'CMP62', 'CMP69', 'CMP73', 'CMP13', 'CMP16', 'CMP28', 'CMP44', 'CMP42', 'CMP41', 'CMP30', 'CMP35', 'CMP45', 'CMP66', 'CMP72', 'CMP14', 'CMP29', 'CMP06', 'CMP20', 'CMP25', 'CMP39', 'CMP56', 'CMP61', 'CMP58', 'CMP31', 'CMP33', 'CMP15', 'CMP50', 'CMP23', 'CMP59', 'CMP53', 'CMP19', 'CMP49', 'CMP18', 'CMP32', 'CMP36', 'CMP52', 'CMP60', 'CMP11', 'CMP22', 'CMP27', 'CMP71', 'CMP74', 'CMP70', 'CMP24', 'CMP48', 'CMP75', 'CMP51', 'CMP12', 'CMP07', 'CMP04', 'CMP10', 'CMP05', 'CMP64', 'CMP43', 'CMP54', 'CMP37', 'CMP40', 'CMP09'}
Test unique values for Company: {'CMP26', 'CMP46', 'CMP17', 'CMP38', 'CMP03', 'CMP55', 'CMP67', 'CMP63', 'CMP34', 'CMP68', 'CMP57', 'CMP65', 'CMP08', 'CMP01', 'CMP02', 'CMP21', 'CMP47', 'CMP62', 'CMP69', 'CMP73', 'CMP13', 'CMP16', 'CMP28', 'CMP44', 'CMP42', 'CMP41', 'CMP30', 'CMP35', 'CM

Unnamed: 0,ID,Sales
0,7,2429.490000
1,8,2123.756667
2,16,4497.990000
3,17,3433.739333
4,25,4676.140000
...,...,...
145,656,5664.055000
146,664,2897.090000
147,665,2829.040000
148,673,2496.281667


In [None]:
# Score the fitted pipeline on the very data it was trained on. This is an
# in-sample figure, so it says how well the model fits the training rows, not
# how well it generalizes to unseen data.
train_score = model.score(X=X_train, y=Y_train)
print(f"Training R^2 score: {train_score:.2f}")

Training R^2 score: 0.94
