In [2]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.impute import SimpleImputer

# Step 1: Load the dataset
df = pd.read_csv('data.csv')

# Step 2: Preprocess the data
df['retention_decision'] = (df['imdbAverageRating'] > 6.5).astype(int)

# Impute numerical columns with the mean
num_imputer = SimpleImputer(strategy='mean')
numerical_features = ['imdbAverageRating', 'imdbNumVotes', 'releaseYear']
df[numerical_features] = num_imputer.fit_transform(df[numerical_features])

# Impute categorical columns with the most frequent value
cat_imputer = SimpleImputer(strategy='most_frequent')
df['type'] = cat_imputer.fit_transform(df[['type']]).ravel()

df['genres'] = df['genres'].fillna('Unknown')
df['availableCountries'] = df['availableCountries'].fillna('')

# Feature Engineering
df['num_countries'] = df['availableCountries'].apply(lambda x: len(x.split(', ')))
genres = df['genres'].str.get_dummies(sep=', ')
df['type_encoded'] = (df['type'] == 'movie').astype(int)

features = pd.concat([
    df[['imdbAverageRating', 'imdbNumVotes', 'releaseYear', 'num_countries', 'type_encoded']], 
    genres
], axis=1)
target = df['retention_decision']

# Step 3: Split the data
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Step 4: Train the Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Step 5: Evaluate the model
y_pred = rf_model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Step 6: Feature Importance
importances = rf_model.feature_importances_
feature_names = features.columns
importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances}).sort_values(by='Importance', ascending=False)
print("\nFeature Importances:\n", importance_df)



# from sklearn.model_selection import cross_val_score
# from sklearn.metrics import make_scorer, accuracy_score

# # Initialize the Random Forest Classifier
# rf_model = RandomForestClassifier(random_state=42)

# # Perform 5-Fold Cross-Validation
# cv_scores = cross_val_score(rf_model, X, y, cv=5, scoring='accuracy')

# # Print Results
# print("Cross-Validation Scores for Each Fold:", cv_scores)
# print("Mean Accuracy:", cv_scores.mean())
# print("Standard Deviation:", cv_scores.std())

import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load the dataset
df = pd.read_csv("data.csv")

# Inspect the columns to find the correct target
print(df.columns)

# Define features and target
X = df.drop(columns=['imdbId', 'type', 'genres', 'availableCountries'])  # Drop unnecessary columns
y = df['retained']  # Replace 'retained' with the actual target column name

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the XGBoost model
xgb_model = XGBClassifier(eval_metric='mlogloss', use_label_encoder=False)

# Train the model
xgb_model.fit(X_train, y_train)

# Make predictions
y_pred = xgb_model.predict(X_test)

# Evaluate performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Feature Importances
print("Feature Importances:")
feature_importance = pd.DataFrame({
    'Feature': X.columns,
    'Importance': xgb_model.feature_importances_
}).sort_values(by='Importance', ascending=False)
print(feature_importance)




Accuracy: 1.0

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       660
           1       1.00      1.00      1.00       780

    accuracy                           1.00      1440
   macro avg       1.00      1.00      1.00      1440
weighted avg       1.00      1.00      1.00      1440


Feature Importances:
                Feature  Importance
0    imdbAverageRating    0.782705
1         imdbNumVotes    0.070059
2          releaseYear    0.035779
3        num_countries    0.021896
4         type_encoded    0.013465
14               Drama    0.011623
10           Biography    0.009374
13         Documentary    0.007724
38             Unknown    0.006526
20              Horror    0.004724
11              Comedy    0.004251
8            Adventure    0.003296
5               Action    0.003277
9            Animation    0.002735
12               Crime    0.002713
19             History    0.001819
37            Th

KeyError: 'retained'