In [4]:
# Import Required Libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import ClassifierChain
from sklearn.metrics import accuracy_score, classification_report
import neattext.functions as nfx  # Ensure neattext is installed: pip install neattext
from sklearn.preprocessing import LabelEncoder

# Load Dataset
df = pd.read_csv(r"C:\Users\yashw\Downloads\netflix_titles.csv\netflix_titles.csv")  # Adjust path if needed
print("Dataset Loaded Successfully!")
print("Dataset Preview:")
print(df.head())

# Handle Missing Values
df.dropna(subset=['title', 'type', 'rating'], inplace=True)

# Convert 'type' and 'rating' columns to strings
df['type'] = df['type'].astype(str)
df['rating'] = df['rating'].astype(str)

# Text Cleaning
df['title'] = df['title'].str.lower()
df['title'] = df['title'].apply(nfx.remove_stopwords)

print("Cleaned Dataset Preview:")
print(df.head())

# Features and Labels
Xfeatures = df['title']  # Input feature: 'title'
ylabels = df[['type', 'rating']]  # Output labels: 'type' and 'rating'

# Convert 'title' column to numerical features using CountVectorizer
vectorizer = CountVectorizer()
Xfeatures_vectorized = vectorizer.fit_transform(Xfeatures)

# Encode 'type' and 'rating' as numeric values
le_type = LabelEncoder()
le_rating = LabelEncoder()

df['type_encoded'] = le_type.fit_transform(df['type'])
df['rating_encoded'] = le_rating.fit_transform(df['rating'])

# Convert ylabels to a NumPy array
ylabels_array = df[['type_encoded', 'rating_encoded']].to_numpy()

# Split Data into Training and Testing Sets
x_train, x_test, y_train, y_test = train_test_split(
    Xfeatures_vectorized, ylabels_array, test_size=0.3, random_state=7
)

# Use ClassifierChain with RandomForestClassifier for Multi-Output Tasks
chain = ClassifierChain(RandomForestClassifier(n_estimators=100, random_state=7))

# Train the Classifier
chain.fit(x_train, y_train)

# Evaluate the Model
accuracy = chain.score(x_test, y_test)
print("Model Accuracy Score:", accuracy)

# Detailed Metrics
y_pred = chain.predict(x_test)
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=['type', 'rating']))

# Sample Prediction
sample_text = x_test[0]  # Directly use sparse matrix slice
predicted = chain.predict(sample_text)

# Convert predictions back to original labels
predicted_type = le_type.inverse_transform([predicted[0, 0]])[0]
predicted_rating = le_rating.inverse_transform([predicted[0, 1]])[0]

print("Predicted Output:")
print(f"Type: {predicted_type}, Rating: {predicted_rating}")


Dataset Loaded Successfully!
Dataset Preview:
  show_id     type                  title         director  \
0      s1    Movie   Dick Johnson Is Dead  Kirsten Johnson   
1      s2  TV Show          Blood & Water              NaN   
2      s3  TV Show              Ganglands  Julien Leclercq   
3      s4  TV Show  Jailbirds New Orleans              NaN   
4      s5  TV Show           Kota Factory              NaN   

                                                cast        country  \
0                                                NaN  United States   
1  Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...   South Africa   
2  Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...            NaN   
3                                                NaN            NaN   
4  Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...          India   

           date_added  release_year rating   duration  \
0  September 25, 2021          2020  PG-13     90 min   
1  September 24, 2021          2021  TV-

ValueError: multiclass-multioutput is not supported

In [None]:
# Importing necessary libraries 
import numpy as np 
import pandas as pd 
from sklearn.datasets import make_multilabel_classification 
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression 
from sklearn.metrics import mean_squared_error, r2_score 
# Generate the dataset 
X, Y = make_multilabel_classification(n_samples=5000, n_features=10, n_classes=2, random_state=42) 
# Convert Y into a continuous format for regression (sum of labels as continuous targets) 
# For demonstration, you can replace this logic as per your regression objective 
Y_continuous = Y.sum(axis=1) 
# Convert the dataset into a DataFrame for better visualization 
data = pd.DataFrame(X, columns=[f"Feature_{i+1}" for i in range(X.shape[1])]) 
data['Target'] = Y_continuous 
# Display the first few rows of the dataset 
print("Dataset Preview:") 
print(data.head()) 
# Define independent variables (features) and dependent variable (target) 
X_train, X_test, y_train, y_test = train_test_split(X, Y_continuous, test_size=0.3, random_state=42) 
# Create and train the multiple linear regression model 
model = LinearRegression() 
model.fit(X_train, y_train) 
# Model coefficients and intercept 
print("\nModel Coefficients:") 
print("Intercept:", model.intercept_) 
print("Coefficients:", model.coef_) 
# Make predictions 
y_pred = model.predict(X_test) 
# Evaluate the model 
mse = mean_squared_error(y_test, y_pred) 
r2 = r2_score(y_test, y_pred) 
print("\nModel Evaluation:") 
print("Mean Squared Error (MSE):", mse) 
print("R-squared (R2):", r2) 
# Visualize the predicted vs actual values 
import matplotlib.pyplot as plt
plt.scatter(y_test, y_pred, color='blue', alpha=0.5) 
plt.xlabel("Actual Target Values") 
plt.ylabel("Predicted Target Values") 
plt.title("Actual vs Predicted Target Values") 
plt.grid() 
plt.show()