In [None]:
import pandas as pd
import joblib


# Read the movie dataset from the project's data directory
data = pd.read_csv("../data/final_movie_data.csv")

# Preview the first five rows of the frame
print(data.head(5))

# List every column name, one per line, to help choose which ones to keep
for column_name in data.columns:
    print(column_name)

In [None]:
# Columns retained for modelling; every other column of the loaded frame is dropped
columns_to_keep = ['adult', 'budget', 'genres_1_name', 'overview', 'popularity', 
                   'production_companies_1_origin_country', 'revenue', 'runtime', 
                   'spoken_languages_1_english_name', 'spoken_languages_1_iso_639_1', 
                   'spoken_languages_1_name', 'status', 'tagline', 'title', 'video', 
                   'vote_average', 'vote_count']

# Keep only the selected columns.
# Take an explicit copy: later cells assign columns back into `data`, and
# assigning into a selection of another frame can trigger pandas'
# SettingWithCopyWarning (on pandas versions without copy-on-write).
data = data[columns_to_keep].copy()

print(data.head())


In [54]:
# Identify numerical and categorical columns by dtype
numerical_cols = data.select_dtypes(include=['number']).columns.tolist()
categorical_cols = data.select_dtypes(include=['object']).columns.tolist()

# Fill missing values in numerical columns with the per-column median
# NOTE(review): if 'budget' is numeric it is imputed here too, although it is
# used as the regression target further down - confirm that is intended.
# NOTE(review): the medians/modes are computed on the full frame, before the
# train/test split, so test rows influence the imputation values.
data[numerical_cols] = data[numerical_cols].fillna(data[numerical_cols].median())

# Fill missing values in categorical columns with the most frequent value
for col in categorical_cols:
    modes = data[col].mode()
    # mode() returns an empty Series when the column has no non-missing
    # values; indexing it with .iloc[0] would raise IndexError, so such a
    # column is left untouched instead.
    if modes.empty:
        continue
    data[col] = data[col].fillna(modes.iloc[0])


In [55]:
# Expand each categorical column into indicator columns; the first level of
# every column is dropped (drop_first=True).
# NOTE(review): categorical_cols is every object-dtype column, which presumably
# includes free-text fields such as 'overview', 'tagline' and 'title' - confirm
# their cardinality, since each distinct value becomes its own column.
encoded_data = pd.get_dummies(
    data=data,
    columns=categorical_cols,
    drop_first=True,
)


In [56]:
from sklearn.model_selection import train_test_split

# The target is the 'budget' column; every other encoded column is a feature
y = encoded_data['budget']
X = encoded_data.drop(columns=['budget'])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
from sklearn.linear_model import LinearRegression

# Build a linear regression estimator and fit it on the training split
# (fit() returns the estimator itself, so the result can be bound directly)
model = LinearRegression().fit(X_train, y_train)

# Leave the fitted estimator as the cell's last expression so it is displayed
model


In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# Predict the target for the held-out test rows
y_pred = model.predict(X_test)

# Compute both evaluation metrics against the true test targets
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Report the metrics: mean squared error first, then R-squared
print("Mean Squared Error:", mse)
print("R-squared Score:", r2)


In [None]:

# Persist the fitted estimator `model` to 'model.pkl' in the current working directory
joblib.dump(value=model, filename='model.pkl')
