In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.impute import SimpleImputer

# Step 1: Load the dataset
df = pd.read_csv('data.csv')


# Step 2: Preprocess the data
# The label is derived from imdbAverageRating, so a title without a rating has
# no defined label. `NaN > 6.5` evaluates to False, which would silently file
# such titles under class 0 -- drop them instead of inventing a label.
df = df.dropna(subset=['imdbAverageRating']).reset_index(drop=True)
df['retention_decision'] = (df['imdbAverageRating'] > 6.5).astype(int)

# Decide train/test membership BEFORE fitting any imputer so that the fill
# values are learned from training rows only (no test-set statistics leak in).
row_ids = df.index.to_numpy()
train_idx, test_idx = train_test_split(row_ids, test_size=0.2, random_state=42)

# Impute numerical columns with the training-set mean.
# imdbAverageRating is intentionally absent: it defines the target, so it must
# not be used as a predictor (and it has no missing values left after dropna).
num_imputer = SimpleImputer(strategy='mean')
numerical_features = ['imdbNumVotes', 'releaseYear']
num_imputer.fit(df.loc[train_idx, numerical_features])
df[numerical_features] = num_imputer.transform(df[numerical_features])

# Impute categorical columns with the most frequent training-set value
cat_imputer = SimpleImputer(strategy='most_frequent')
cat_imputer.fit(df.loc[train_idx, ['type']])
df['type'] = cat_imputer.transform(df[['type']]).ravel()

df['genres'] = df['genres'].fillna('Unknown')
df['availableCountries'] = df['availableCountries'].fillna('')

# Feature Engineering
# ''.split(', ') returns [''], so an empty country list must be special-cased
# or it would be counted as one country.
df['num_countries'] = df['availableCountries'].apply(
    lambda x: len(x.split(', ')) if x.strip() else 0
)
genres = df['genres'].str.get_dummies(sep=', ')
df['type_encoded'] = (df['type'] == 'movie').astype(int)

# Predictors only -- the rating column is excluded to avoid target leakage.
features = pd.concat([
    df[['imdbNumVotes', 'releaseYear', 'num_countries', 'type_encoded']],
    genres
], axis=1)
target = df['retention_decision']

# Step 3: Split the data (using the membership chosen above)
X_train, X_test = features.loc[train_idx], features.loc[test_idx]
y_train, y_test = target.loc[train_idx], target.loc[test_idx]

# Step 4: Train the Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Step 5: Evaluate the model
y_pred = rf_model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Step 6: Feature Importance
importances = rf_model.feature_importances_
feature_names = features.columns
importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances}).sort_values(by='Importance', ascending=False)
print("\nFeature Importances:\n", importance_df)



# from sklearn.model_selection import cross_val_score
# from sklearn.metrics import make_scorer, accuracy_score

# # Initialize the Random Forest Classifier
# rf_model = RandomForestClassifier(random_state=42)

# # Perform 5-Fold Cross-Validation
# cv_scores = cross_val_score(rf_model, X, y, cv=5, scoring='accuracy')

# # Print Results
# print("Cross-Validation Scores for Each Fold:", cv_scores)
# print("Mean Accuracy:", cv_scores.mean())
# print("Standard Deviation:", cv_scores.std())

import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load the dataset
df = pd.read_csv("data.csv")

# Inspect the columns to find the correct target
print(df.columns)

# Rows without a rating cannot be labelled (`NaN > 6.5` is False and would be
# silently treated as class 0), so drop them before deriving the target.
df = df.dropna(subset=['imdbAverageRating']).reset_index(drop=True)

# Define features and target
# XGBClassifier needs discrete class labels 0..K-1; fitting it on the raw
# continuous rating raises "Invalid classes inferred from unique values of y".
# Use the same binary rule as the Random Forest pipeline: rating > 6.5.
y = (df['imdbAverageRating'] > 6.5).astype(int)

# Drop identifier/text columns plus the rating itself: the rating defines the
# target, so keeping it as a feature would leak the answer into the model.
X = df.drop(columns=['imdbId', 'type', 'genres', 'availableCountries', 'imdbAverageRating'])
# Keep numeric predictors only; any remaining text columns cannot be fed to
# XGBoost without encoding.
X = X.select_dtypes(include='number')

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the XGBoost model
# 'logloss' is the binary-classification metric ('mlogloss' is multiclass).
# The deprecated `use_label_encoder` argument is not passed: y is already 0/1.
xgb_model = XGBClassifier(eval_metric='logloss', random_state=42)

# Train the model (XGBoost handles remaining NaNs in X natively)
xgb_model.fit(X_train, y_train)

# Make predictions
y_pred = xgb_model.predict(X_test)

# Evaluate performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Feature Importances
print("Feature Importances:")
feature_importance = pd.DataFrame({
    'Feature': X.columns,
    'Importance': xgb_model.feature_importances_
}).sort_values(by='Importance', ascending=False)
print(feature_importance)




Accuracy: 0.9991304347826087

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       537
           1       1.00      1.00      1.00       613

    accuracy                           1.00      1150
   macro avg       1.00      1.00      1.00      1150
weighted avg       1.00      1.00      1.00      1150


Feature Importances:
                Feature  Importance
0    imdbAverageRating    0.789962
1         imdbNumVotes    0.071471
2          releaseYear    0.028112
3        num_countries    0.023687
4         type_encoded    0.019773
38             Unknown    0.007988
14               Drama    0.007611
10           Biography    0.007008
13         Documentary    0.005083
11              Comedy    0.004190
20              Horror    0.003599
8            Adventure    0.003277
9            Animation    0.003007
5               Action    0.002864
12               Crime    0.002583
37            Thriller    0.001957
1

ValueError: Invalid classes inferred from unique values of `y`.  Expected: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71
 72 73], got [1.5 1.6 2.  2.2 2.4 2.5 2.8 2.9 3.  3.1 3.2 3.3 3.4 3.5 3.6 3.7 3.8 3.9
 4.  4.1 4.2 4.3 4.4 4.5 4.6 4.7 4.8 4.9 5.  5.1 5.2 5.3 5.4 5.5 5.6 5.7
 5.8 5.9 6.  6.1 6.2 6.3 6.4 6.5 6.6 6.7 6.8 6.9 7.  7.1 7.2 7.3 7.4 7.5
 7.6 7.7 7.8 7.9 8.  8.1 8.2 8.3 8.4 8.5 8.6 8.7 8.8 8.9 9.  9.1 9.2 9.3
 9.4 nan]