<a href="https://colab.research.google.com/github/venkatagollapalli28-netizen/codsoft-/blob/main/MOVIE_RATING__PREDICTION_WITH_PYTHON.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import joblib
import numpy as np
import pandas as pd
from scipy.sparse import hstack
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# 1. Create a mock dataset
# Five hand-written rows: enough to exercise the pipeline end to end, far too
# few for the reported metrics to say anything about real-world accuracy.
data = {
    'Title': ['Movie A', 'Movie B', 'Movie C', 'Movie D', 'Movie E'],
    'Year': [2000, 2005, 2010, 2015, 2020],
    'Duration': [120, 90, 150, 110, 140],
    'Genre': ['Action, Adventure', 'Drama', 'Comedy, Romance', 'Action', 'Drama, Thriller'],
    'Director': ['Director 1', 'Director 2', 'Director 3', 'Director 1', 'Director 4'],
    'Actor1': ['Actor A', 'Actor B', 'Actor C', 'Actor A', 'Actor D'],
    'Actor2': ['Actor E', 'Actor F', 'Actor G', 'Actor H', 'Actor I'],
    'Rating': [7.5, 8.0, 6.5, 7.0, 8.5],
    'Votes': [150000, 80000, 120000, 50000, 200000]
}
df = pd.DataFrame(data)

# 2. Preprocessing
# Join both actor columns into one comma-separated string so a single
# vectorizer can encode the whole cast.
df['Actors_Combined'] = df['Actor1'] + ', ' + df['Actor2']

# 3. Split data
# Split the rows before fitting anything, so vocabularies and IDF weights are
# learned from the training rows only and the test rows stay unseen.
df_train, df_test = train_test_split(df, test_size=0.4, random_state=42)

# 4. TF-IDF transformations
# Treat every comma-separated entry ("Sci-Fi", "Director 1", "Actor A") as one
# token. The default token pattern only keeps runs of two or more word
# characters, which reduces "Director 1" .. "Director 4" to the same token
# "director" and makes the director/actor features identical for every row.
COMMA_TOKEN_PATTERN = r"(?u)[^,\s](?:[^,]*[^,\s])?"

tfidf_genre = TfidfVectorizer(token_pattern=COMMA_TOKEN_PATTERN)
tfidf_genre.fit(df_train['Genre'])

tfidf_director = TfidfVectorizer(token_pattern=COMMA_TOKEN_PATTERN)
tfidf_director.fit(df_train['Director'])

tfidf_actors = TfidfVectorizer(token_pattern=COMMA_TOKEN_PATTERN)
tfidf_actors.fit(df_train['Actors_Combined'])

# 5. Numeric features and targets
NUMERIC_COLUMNS = ['Year', 'Duration', 'Votes']
y = df['Rating'].values
y_train = df_train['Rating'].values
y_test = df_test['Rating'].values


# 6. Combine features
def build_features(frame):
    """Encode the rows of `frame` as one sparse feature matrix.

    Column order is genre TF-IDF, director TF-IDF, actors TF-IDF, then
    Year / Duration / Votes -- the same order `predict_rating` uses.
    """
    return hstack(
        [
            tfidf_genre.transform(frame['Genre']),
            tfidf_director.transform(frame['Director']),
            tfidf_actors.transform(frame['Actors_Combined']),
            frame[NUMERIC_COLUMNS].values,
        ],
        format='csr',
    )


X_train = build_features(df_train)
X_test = build_features(df_test)

# 7. Model training
# Votes is several orders of magnitude larger than any TF-IDF weight; without
# rescaling, Ridge's L2 penalty effectively silences the text features.
# with_mean=False keeps the matrix sparse (note it rescales the TF-IDF columns
# as well). The scaler lives inside the pipeline, so the saved model applies
# it automatically at prediction time. Tree ensembles are scale-invariant, so
# the random forest is trained on the raw matrix.
models = {
    'Ridge': make_pipeline(StandardScaler(with_mean=False), Ridge()),
    'RandomForest': RandomForestRegressor(n_estimators=50, random_state=42)
}
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    mse = mean_squared_error(y_test, preds)  # computed once, reused for RMSE
    # Cast to built-in floats so the printed dict has no np.float64(...) reprs.
    results[name] = {
        'MAE': float(mean_absolute_error(y_test, preds)),
        'MSE': float(mse),
        'RMSE': float(np.sqrt(mse)),
        'R2': float(r2_score(y_test, preds))
    }

# 8. Output results
for name, metrics in results.items():
    print(f"{name} Results: {metrics}")

# The model with the lowest test RMSE is kept.
best_name = min(results, key=lambda k: results[k]['RMSE'])
best_model = models[best_name]
print(f"\nBest model: {best_name}")

# 9. Save model and vectorizers
# The vectorizers are stored under the keys that `predict_rating` looks up.
joblib.dump(best_model, 'best_model_demo.joblib')
joblib.dump({'genre': tfidf_genre, 'director': tfidf_director, 'actors': tfidf_actors}, 'vectorizers_demo.joblib')
# 10. Predict function
def predict_rating(movie_dict, model=None, vectorizers=None):
    """Predict the rating of a single movie from its metadata.

    Parameters
    ----------
    movie_dict : dict
        Must contain the keys 'Genre', 'Director', 'Actors_Combined',
        'Year', 'Duration' and 'Votes'.
    model : object with a ``predict`` method, optional
        Already-loaded model. When omitted, it is loaded from
        'best_model_demo.joblib'.
    vectorizers : dict, optional
        Already-loaded vectorizers under the keys 'genre', 'director' and
        'actors'. When omitted, they are loaded from
        'vectorizers_demo.joblib'.

    Returns
    -------
    float
        The model's prediction for the single input row.

    Raises
    ------
    KeyError
        If any required key is absent from `movie_dict`; the message lists
        every missing key.
    """
    required = ('Genre', 'Director', 'Actors_Combined', 'Year', 'Duration', 'Votes')
    missing = [key for key in required if key not in movie_dict]
    if missing:
        # Fail before touching the disk, and report all missing keys at once.
        raise KeyError(f"movie_dict is missing required keys: {missing}")

    # Only hit the disk for artifacts the caller did not supply, so repeated
    # predictions can reuse objects that are already in memory.
    if vectorizers is None:
        vectorizers = joblib.load('vectorizers_demo.joblib')
    if model is None:
        model = joblib.load('best_model_demo.joblib')

    g = vectorizers['genre'].transform([movie_dict['Genre']])
    d = vectorizers['director'].transform([movie_dict['Director']])
    a = vectorizers['actors'].transform([movie_dict['Actors_Combined']])
    num = np.array(
        [[movie_dict['Year'], movie_dict['Duration'], movie_dict['Votes']]],
        dtype=float,
    )
    # Same column order as the training matrix: genre, director, actors, numeric.
    features = hstack([g, d, a, num], format='csr')
    return float(model.predict(features)[0])

# 11. Example prediction
# Score a movie whose director and actors do not appear in the mock dataset.
new_movie = dict(
    Genre='Action, Sci-Fi',
    Director='Director 5',
    Actors_Combined='Actor X, Actor Y',
    Year=2022,
    Duration=130,
    Votes=100000,
)
print("Predicted rating for new movie:", predict_rating(new_movie))


Ridge Results: {'MAE': 0.7470268901419725, 'MSE': 0.8017622322475052, 'RMSE': np.float64(0.8954117668690228), 'R2': -11.828195715960083}
RandomForest Results: {'MAE': 1.3000000000000003, 'MSE': 1.7924000000000007, 'RMSE': np.float64(1.3388054376943652), 'R2': -27.67840000000001}

Best model: Ridge
Predicted rating for new movie: 6.81440971285326
