In [2]:
import pandas as pd
import catboost as cb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [4]:
df = pd.read_csv('/Users/Trisha/Desktop/Software Des/box-office-prediction/cleaned_data/data_clean_v2.csv')
for col in ['actor', 'actress', 'director', 'writer', 'genres']:
    df[col] = df[col].fillna('missing')

In [5]:
features = ['actor', 'actress', 'director', 'writer', 'runtimeMinutes', 'genres', 'isAdult']
target = 'averageRating'

In [6]:
X_train, X_test, y_train, y_test = train_test_split(df[features], df[target], test_size=0.3, random_state=42)

categorical_features_indices = [X_train.columns.get_loc(col) for col in ['actor', 'actress', 'director', 'writer', 'genres']]

In [11]:
model = cb.CatBoostRegressor(
    iterations=4000,
    learning_rate=0.4,
    depth=9,
    eval_metric='RMSE',
    cat_features=categorical_features_indices,
    use_best_model=True
)

model.fit(X_train, y_train, eval_set=(X_test, y_test), early_stopping_rounds=50, verbose=100)


0:	learn: 0.8325703	test: 0.8284880	best: 0.8284880 (0)	total: 90.2ms	remaining: 6m
100:	learn: 0.5110903	test: 0.4935249	best: 0.4935249 (100)	total: 5.4s	remaining: 3m 28s
200:	learn: 0.4660215	test: 0.4510824	best: 0.4510824 (200)	total: 11.1s	remaining: 3m 29s
300:	learn: 0.4371056	test: 0.4258535	best: 0.4258535 (300)	total: 17.8s	remaining: 3m 38s
400:	learn: 0.4154361	test: 0.4082759	best: 0.4082759 (400)	total: 24.1s	remaining: 3m 36s
500:	learn: 0.3983396	test: 0.3957122	best: 0.3957122 (500)	total: 30.2s	remaining: 3m 30s
600:	learn: 0.3826137	test: 0.3843987	best: 0.3843987 (600)	total: 36.1s	remaining: 3m 24s
700:	learn: 0.3691253	test: 0.3756793	best: 0.3756793 (700)	total: 41.9s	remaining: 3m 17s
800:	learn: 0.3566976	test: 0.3670143	best: 0.3670143 (800)	total: 47.8s	remaining: 3m 10s
900:	learn: 0.3461277	test: 0.3615328	best: 0.3615328 (900)	total: 53.3s	remaining: 3m 3s
1000:	learn: 0.3372212	test: 0.3569681	best: 0.3569681 (1000)	total: 58.8s	remaining: 2m 56s
1100:	

<catboost.core.CatBoostRegressor at 0x169a3f2d0>

In [12]:

y_pred = model.predict(X_test)

rmse = mean_squared_error(y_test, y_pred, squared=False)
print(f'Root Mean Squared Error: {rmse}')

Root Mean Squared Error: 0.3062836421761508


In [13]:
from sklearn.metrics import r2_score

y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)
print(f'R-squared: {r2}')


R-squared: 0.9046456034645315


In [15]:
hypothetical_movie = {
    'actor': 'Robert Downey Jr.',
    'actress': '',
    'director': '',
    'writer': '',
    'runtimeMinutes': 150,
    'genres': 'Sci-Fi',
    'isAdult': 0
}

new_data = pd.DataFrame([hypothetical_movie])

predicted_rating = model.predict(new_data)
print(f'Predicted Rating: {predicted_rating[0]}')


Predicted Rating: 6.538266393766316


In [None]:
model.save_model("CatBoosy")