In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

# Location of the movies CSV handed to load_and_clean_movies().
# NOTE(review): absolute, machine-specific Windows path — adjust before running elsewhere.
FILE_PATH = r'C:\Users\ASUS\Documents\datascience\imdbmovies.csv'
# Name of the column the regressor is trained to predict.
TARGET_COLUMN = 'Rating'
# How many of the most frequent genres are turned into indicator features.
TOP_GENRES_COUNT = 10

def _parse_integer_text(series, remove):
    """Parse a text column into nullable integers.

    *remove* is deleted literally from every value and the result is
    stripped. Values that still contain an ASCII letter (including the
    ``'nan'`` produced by stringifying a missing value) or that cannot be
    parsed as a number become ``<NA>``.
    """
    text = series.astype(str).str.replace(remove, '', regex=False).str.strip()
    # Blank out anything with letters so that strings such as 'inf' or '1e5'
    # are rejected instead of being parsed as numbers.
    text = text.mask(text.str.contains('[a-zA-Z]', regex=True))
    return pd.to_numeric(text, errors='coerce').astype('Int64')


def load_and_clean_movies(file_path, min_votes=5):
    """Load the movies CSV and return only rows with usable numeric fields.

    Cleaning steps, applied in order:

    1. Strip whitespace from the column names.
    2. Drop rows without a target value (``TARGET_COLUMN``).
    3. Parse ``Votes`` (thousands separators removed) and keep rows with at
       least *min_votes* votes.
    4. Extract the four-digit year written in parentheses in ``Year``.
    5. Parse ``Duration`` after removing the ``' min'`` suffix.

    Rows for which any of these fields cannot be parsed are dropped.

    Args:
        file_path: Path of the CSV file; it is read with latin-1 encoding.
        min_votes: Minimum number of votes a movie needs to be kept.

    Returns:
        A new DataFrame in which ``Votes``, ``Year`` and ``Duration`` have
        the nullable ``Int64`` dtype and contain no missing values.

    Raises:
        FileNotFoundError: If *file_path* does not exist.
        KeyError: If an expected column is missing from the file.
    """
    df = pd.read_csv(file_path, encoding="latin1")
    df.columns = df.columns.str.strip()
    # Every filtering step below ends in an explicit copy so the following
    # column assignment never writes into a slice of a previous frame.
    df = df.dropna(subset=[TARGET_COLUMN]).copy()

    df['Votes'] = _parse_integer_text(df['Votes'], ',')
    df = df.dropna(subset=['Votes'])
    df = df[df['Votes'] >= min_votes].copy()

    # expand=False yields a Series (one capture group) rather than a
    # one-column DataFrame.
    year_text = df['Year'].astype(str).str.extract(r'\((\d{4})\)', expand=False)
    df['Year'] = year_text.astype(float).astype('Int64')
    df = df.dropna(subset=['Year']).copy()

    df['Duration'] = _parse_integer_text(df['Duration'], ' min')
    # Return an independent frame so callers can add or drop columns on it.
    return df.dropna(subset=['Duration']).copy()

def create_genre_features(df, top_n=10):
    """Replace the ``Genre`` column with indicator columns for the top genres.

    Each ``Genre`` value is treated as a comma-separated list. The *top_n*
    most frequent genre names are selected, and for each of them a column
    ``Genre_<name>`` is added holding 1 when the movie lists exactly that
    genre and 0 otherwise. Matching is done on whole genre names, so
    ``Music`` is not set for a movie whose genre is ``Musical``.

    The frame is modified in place (columns added, ``Genre`` dropped) and
    also returned.

    Args:
        df: DataFrame with a ``Genre`` column; missing values are allowed
            and produce 0 in every indicator column.
        top_n: Number of most frequent genres to encode.

    Returns:
        Tuple ``(df, top_genres)`` where ``top_genres`` is the list of
        selected genre names, most frequent first.
    """
    def split_genres(value):
        # Non-string cells (NaN/None) carry no genres.
        if not isinstance(value, str):
            return []
        return [part.strip() for part in value.split(',') if part.strip()]

    # Tokenise every row once; reused for counting and for the indicators.
    genre_lists = [split_genres(value) for value in df['Genre']]

    all_genres = pd.Series(
        [genre for genres in genre_lists for genre in genres], dtype=object
    )
    top_genres = all_genres.value_counts().nlargest(top_n).index.tolist()

    genre_sets = [set(genres) for genres in genre_lists]
    for genre in top_genres:
        df[f'Genre_{genre}'] = [int(genre in genres) for genres in genre_sets]

    df.drop('Genre', axis=1, inplace=True)
    return df, top_genres

def prepare_model_data(df):
    """Split *df* into a numeric feature matrix and the target series.

    The identifying text columns and the target column are excluded from
    the features. Every remaining column is converted with
    ``pd.to_numeric``; values that cannot be converted, and missing
    values, become 0.

    Returns:
        Tuple ``(X, y)``: the feature DataFrame and ``df[TARGET_COLUMN]``.
    """
    excluded = ['Name', TARGET_COLUMN, 'Director', 'Actor 1', 'Actor 2', 'Actor 3']
    # Keep every column that is not in the exclusion list.
    features = df.loc[:, ~df.columns.isin(excluded)]
    target = df[TARGET_COLUMN]
    numeric_features = features.apply(pd.to_numeric, errors='coerce')
    numeric_features = numeric_features.fillna(0)
    return numeric_features, target

def train_model(X, y):
    """Fit a random-forest regressor on 80% of the data and return it.

    The data is split 80/20 with a fixed seed and the forest is trained on
    the larger part only.

    NOTE(review): the 20% hold-out is created but never scored here, so
    this function gives no measure of model quality.
    """
    X_train, _X_holdout, y_train, _y_holdout = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    forest_settings = {
        'n_estimators': 100,
        'max_depth': 15,
        'random_state': 42,
        'n_jobs': -1,  # use every available core
    }
    forest = RandomForestRegressor(**forest_settings)
    forest.fit(X_train, y_train)
    return forest

def predict_movie_rating(model, movie_info, top_genres, feature_columns):
    """Predict the rating of a single movie.

    Builds a one-row feature frame laid out like the training features and
    passes it to ``model.predict``.

    Args:
        model: Fitted estimator exposing ``predict``.
        movie_info: Mapping of raw feature values. An optional ``'Genre'``
            entry holds a comma-separated genre string.
        top_genres: Genre names that were encoded as ``Genre_<name>``
            indicator columns.
        feature_columns: Column names, in order, that the model expects.

    Returns:
        The first (and only) element of the model's prediction.

    Genres are matched as whole names, so ``Music`` is not set for a movie
    whose genre is ``Musical``. Features absent from *movie_info* are
    filled with 0, and values that cannot be converted to numbers also
    become 0, the same coercion ``prepare_model_data`` applies.
    """
    row = pd.DataFrame([movie_info])

    # Parse the genre string once instead of once per top genre.
    raw_genre = row['Genre'].iloc[0] if 'Genre' in row.columns else None
    if isinstance(raw_genre, str):
        movie_genres = {part.strip() for part in raw_genre.split(',') if part.strip()}
    else:
        movie_genres = set()

    for genre in top_genres:
        row[f'Genre_{genre}'] = int(genre in movie_genres)

    # Select the expected columns in order, creating missing ones as 0;
    # this also drops 'Genre' and any other extra key.
    row = row.reindex(columns=list(feature_columns), fill_value=0)
    row = row.apply(pd.to_numeric, errors='coerce').fillna(0)
    return model.predict(row)[0]

if __name__ == '__main__':
    # Demo run: clean the data, train the model, then predict the rating
    # of one randomly chosen movie from the cleaned data set.
    try:
        movies = load_and_clean_movies(FILE_PATH)
        movies, top_genres_list = create_genre_features(movies, top_n=TOP_GENRES_COUNT)
        X, y = prepare_model_data(movies)
        model = train_model(X, y)

        # The sample is unseeded, so a different movie is picked on each run.
        example_movie = movies.sample(1).iloc[0]

        # Rebuild the comma-separated genre string from the indicator columns.
        active_genres = []
        for g in top_genres_list:
            if example_movie.get(f'Genre_{g}', 0) == 1:
                active_genres.append(g.replace('Genre_', ''))
        genres_str = ', '.join(active_genres)

        movie_info = {field: example_movie[field] for field in ('Year', 'Duration', 'Votes')}
        movie_info['Genre'] = genres_str

        predicted_rating = predict_movie_rating(model, movie_info, top_genres_list, X.columns)
        print(f"Example movie: {example_movie['Name']}")
        print(f"Predicted IMDb rating: {predicted_rating:.2f} ⭐")

    except FileNotFoundError:
        print(f"Oops! The file '{FILE_PATH}' was not found.")
    except Exception as exc:
        # Last-resort handler for the demo: report the problem instead of a traceback.
        print(f"Something went wrong: {exc}")


Example movie: Sharafat Gayi Tel Lene
Predicted IMDb rating: 5.03 ⭐
