## Load the dataset

In [1]:
import os
from pathlib import Path

import pandas as pd
from sklearn.model_selection import train_test_split
# Directory that holds Train_Data.csv and Test_Data.csv. Set the REVENUE_DATA_DIR
# environment variable to run the notebook on another machine; when it is unset
# the original location is used, so existing behaviour is unchanged.
DATA_DIR = Path(os.environ.get("REVENUE_DATA_DIR", r"C:\Users\yuvra\OneDrive\Desktop"))

# `data` is the labelled training table; `test` is the table to predict on.
data = pd.read_csv(DATA_DIR / "Train_Data.csv")
test = pd.read_csv(DATA_DIR / "Test_Data.csv")

## Splitting the dataset

In [3]:
# Target is the `revenue` column; the features are every other column except
# `campaign`, which is dropped from both the training and the test table.
y = data['revenue']
X = data.drop(columns=['revenue', 'campaign'])

# 80/20 train/validation split with a fixed seed so the split is reproducible.
X_train_full, X_valid_full, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0
)

# errors='ignore' keeps this cell re-runnable: on a second execution `campaign`
# is already gone from `test`, and a plain drop would raise a KeyError.
test = test.drop(columns=['campaign'], errors='ignore')



In [5]:
# Preview the first five rows of the training features.
X_train_full.head(n=5)

Unnamed: 0,date,adgroup,ad,impressions,clicks,cost,conversions
483,27-08-2020,adgroup 3,ad 13,398,188,62.03,7
982,26-09-2020,adgroup 3,ad 8,442,212,57.45,1
4101,08-02-2021,adgroup 1,ad 1,16,7,0.07,0
2829,19-12-2020,adgroup 3,ad 4,39,17,0.21,0
2627,11-12-2020,adgroup 3,ad 7,9,6,0.07,0


## Data preprocessing

In [6]:
# Object-dtype columns with fewer than 10 distinct values in the training split.
low_cardinality_cols = [
    name
    for name, dtype in X_train_full.dtypes.items()
    if X_train_full[name].nunique() < 10 and dtype == "object"
]

# Columns stored as int64 or float64.
numerical_cols = [
    name
    for name, dtype in X_train_full.dtypes.items()
    if dtype in ['int64', 'float64']
]

In [7]:
# Restrict every split to the selected columns (categorical first, then numeric);
# .copy() gives independent frames so later edits do not touch the originals.
my_cols = [*low_cardinality_cols, *numerical_cols]
X_train = X_train_full.loc[:, my_cols].copy()
X_valid = X_valid_full.loc[:, my_cols].copy()
X_test = test.loc[:, my_cols].copy()

In [8]:
# Preview the first five rows after column selection.
X_train.head(n=5)

Unnamed: 0,adgroup,impressions,clicks,cost,conversions
483,adgroup 3,398,188,62.03,7
982,adgroup 3,442,212,57.45,1
4101,adgroup 1,16,7,0.07,0
2829,adgroup 3,39,17,0.21,0
2627,adgroup 3,9,6,0.07,0


## Encoding

In [9]:
# Get list of categorical variables
# Boolean mask over the columns: True where the column has object dtype.
s = X_train.dtypes == 'object'
# Names of the columns flagged by the mask.
object_cols = s[s].index.tolist()

print("Categorical variables:", object_cols, sep="\n")

Categorical variables:
['adgroup']


In [10]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the categorical columns. The encoder is fitted on the training
# split only; categories unseen at fit time become all-zero rows
# (handle_unknown='ignore').
# The encoder is left on its default sparse output and densified with .toarray():
# the `sparse=` keyword was renamed to `sparse_output=` in scikit-learn 1.2 and
# later removed, so passing either one ties the notebook to a version range.
OH_encoder = OneHotEncoder(handle_unknown='ignore')
OH_encoder.fit(X_train[object_cols])

# String labels "<column>_<category>" for the encoded columns. With the default
# integer labels the combined frame would mix int and str column names, which
# recent scikit-learn versions reject when the model is fitted.
OH_col_names = [
    "{}_{}".format(col, cat)
    for col, cats in zip(object_cols, OH_encoder.categories_)
    for cat in cats
]

# Encode each split, restoring the original row index so the concat below
# aligns the encoded columns with the numerical ones.
OH_cols_train = pd.DataFrame(OH_encoder.transform(X_train[object_cols]).toarray(),
                             columns=OH_col_names, index=X_train.index)
OH_cols_valid = pd.DataFrame(OH_encoder.transform(X_valid[object_cols]).toarray(),
                             columns=OH_col_names, index=X_valid.index)
OH_cols_test = pd.DataFrame(OH_encoder.transform(X_test[object_cols]).toarray(),
                            columns=OH_col_names, index=X_test.index)

# Remove categorical columns (will replace with one-hot encoding)
num_X_train = X_train.drop(columns=object_cols)
num_X_valid = X_valid.drop(columns=object_cols)
num_X_test = X_test.drop(columns=object_cols)

# Add one-hot encoded columns to numerical features
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)
OH_X_test = pd.concat([num_X_test, OH_cols_test], axis=1)

In [11]:
import pandas as pd
from sklearn.metrics import mean_absolute_error


In [16]:
# NOTE(review): `rf_model` is not defined by any cell in this notebook, and this
# cell sits before the model is trained, so on a fresh kernel (Restart & Run All)
# it used to stop with a NameError. The model trained further down is named
# `regr`, and a later cell writes its test-set predictions to the same
# "revenue.csv". This cell therefore runs only when a fitted `rf_model` already
# exists in the session; consider deleting it.
if "rf_model" in globals():
    target = rf_model.predict(OH_X_test)

    # One-column frame of predictions, one row per row of `test`.
    res = pd.DataFrame(target)
    res.index = test.index
    res.columns = ["revenue"]
    # index=False: only the `revenue` column is written, next to this notebook.
    res.to_csv("revenue.csv", index=False)
else:
    print("Skipping export: `rf_model` is not defined in this session.")

## Choose machine learning model

In [36]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression
# Fit a random forest (depth capped at 8, fixed seed) on the one-hot encoded
# training features.
# The make_regression(...) call that used to precede this was removed: the
# synthetic X, y it produced were never used for fitting, and assigning them
# overwrote the notebook's real feature table `X` and target `y`.
regr = RandomForestRegressor(max_depth=8, random_state=0)
regr.fit(OH_X_train, y_train)

RandomForestRegressor(max_depth=8, random_state=0)

In [37]:
# Predictions for the held-out validation split.
y1 = regr.predict(OH_X_valid)

In [38]:
# Mean absolute error on the validation split. Arguments are given in the
# documented (y_true, y_pred) order; MAE is symmetric, so the value is the same.
rf_val_mae = mean_absolute_error(y_valid, y1)
mae_message = "Validation MAE for Random Forest Model: {:,.0f}".format(rf_val_mae)
print(mae_message)

Validation MAE for Random Forest Model: 67


In [39]:
# Predictions for the unlabelled test table.
y2 = regr.predict(OH_X_test)

In [40]:
# Collect the test-set predictions in a single `revenue` column, indexed like
# `test` so rows can be compared against the input table.
res = pd.DataFrame({"revenue": y2}, index=test.index)

# Written next to this notebook; index=False means only the `revenue` column
# is saved.
res.to_csv("revenue.csv", index=False)