In [None]:
import pandas as pd
import joblib

# Read the prepared movie dataset; the path is relative to the notebook's
# working directory.
data = pd.read_csv(filepath_or_buffer="../data/final_movie_data.csv")


In [None]:

# Restrict the frame to the modelling columns: the regression target
# ('budget') plus the candidate features. Selecting with a list raises a
# KeyError if any of these columns is missing from the loaded CSV.
columns_to_keep = ['adult', 'budget', 'genres_1_name', 'popularity', 'revenue', 'runtime',
                   'vote_average', 'vote_count']

# Take an explicit copy: a column subset is tracked by pandas as derived from
# the original frame, and the column assignments performed by the imputation
# cell below could otherwise emit SettingWithCopyWarning.
data = data[columns_to_keep].copy()


In [None]:
# The regression target must never be imputed: a row with an unknown budget
# has no label, and filling it with the median would fabricate one and
# distort both the fit and the evaluation metrics. Drop such rows instead.
data = data.dropna(subset=['budget'])

numerical_cols = data.select_dtypes(include=['number']).columns.tolist()
categorical_cols = data.select_dtypes(include=['object']).columns.tolist()

# Build one fill value per column:
#   * numeric columns     -> column median
#   * categorical columns -> most frequent value (mode)
# Columns that are entirely missing have no median/mode; they are skipped
# (left as NaN) rather than raising an IndexError on an empty mode().
fill_values = data[numerical_cols].median().dropna().to_dict()
for col in categorical_cols:
    modes = data[col].mode()
    if not modes.empty:
        fill_values[col] = modes.iloc[0]

# NOTE(review): these statistics are computed on the full dataset, i.e. before
# the train/test split performed later in the notebook, so the held-out rows
# influence the fill values. Consider fitting the imputation on the training
# split only.
# A single non-mutating fillna avoids assigning into a frame that may be a
# slice of another one.
data = data.fillna(fill_values)


In [None]:
# Expand every categorical column into indicator columns; the first level of
# each column is dropped (drop_first=True) and acts as the baseline category.
encoded_data = pd.get_dummies(
    data,
    columns=categorical_cols,
    drop_first=True,
)

# Preview the first rows of the encoded frame, then list its column names.
for preview in (encoded_data.head(), encoded_data.columns):
    print(preview)

In [None]:
from sklearn.model_selection import train_test_split

# Separate the regression target ('budget') from the predictor columns.
y = encoded_data.loc[:, 'budget']
X = encoded_data.drop(columns=['budget'])

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# shuffle, and therefore the split, repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression with default settings on the training split.
# The fit call is left as the cell's final expression so the notebook still
# displays the fitted estimator.
model = LinearRegression()
model.fit(X=X_train, y=y_train)

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# Score the fitted model on the held-out test split.
y_pred = model.predict(X_test)

# Mean squared error (same units as budget squared) and the coefficient of
# determination, both computed against the true test targets.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Report the metrics in the same order as they were computed.
print("Mean Squared Error:", mse)
print("R-squared Score:", r2)


In [None]:

# Persist the fitted estimator to 'model.pkl' in the current working directory.
# Left as the cell's final expression so the notebook still shows the list of
# written file names returned by joblib.
joblib.dump(value=model, filename='model.pkl')
