In [2]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
# Path to the folder containing cleaned CSV files
folder_path = 'cleanedtvshows'

# List all cleaned CSV files in the folder
csv_files = [f for f in os.listdir(folder_path) if f.endswith('.csv')]

# Initialize an empty DataFrame to store combined data
combined_df = pd.DataFrame()

# Iterate through each cleaned CSV file
for file_name in csv_files:
    # Read the CSV file into a DataFrame
    file_path = os.path.join(folder_path, file_name)
    df = pd.read_csv(file_path)

    # Concatenate current DataFrame with combined DataFrame
    combined_df = pd.concat([combined_df, df], ignore_index=True)

# Preprocessing: Drop irrelevant columns and handle missing values
# For example:
# df.drop(['irrelevant_column1', 'irrelevant_column2'], axis=1, inplace=True)
# df.dropna(inplace=True)
df = combined_df
# Split the dataset into features (X) and target variable (y)
X = df.drop(columns=['show_id'])  # Features
y = (df['vote_average']+df['popularity']+df['number_of_seasons'])/3  # Target variable

# Encode categorical variables if needed
# For example, using one-hot encoding:
X = pd.get_dummies(X)

# Train-test split (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize and train the logistic regression model
logreg_model = LogisticRegression(max_iter=1000, random_state=42)
logreg_model.fit(X_train, y_train)

# Predictions on the test set
y_pred = logreg_model.predict(X_test)

# Model evaluation
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

MemoryError: Unable to allocate 56.9 GiB for an array with shape (21587, 2831561) and data type bool

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

# Select two features for visualization
feature1 = 'popularity'
feature2 = 'vote_average'

# Extract the selected features from the dataset
X_subset = X_train[[feature1, feature2]].values

# Fit logistic regression model on the subset of features
logreg_model_subset = LogisticRegression(max_iter=1000, random_state=42)
logreg_model_subset.fit(X_subset, y_train)

# Plot decision boundary
h = .02  # step size in the mesh
x_min, x_max = X_subset[:, 0].min() - 1, X_subset[:, 0].max() + 1
y_min, y_max = X_subset[:, 1].min() - 1, X_subset[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
Z = logreg_model_subset.predict(np.c_[xx.ravel(), yy.ravel()])

# Put the result into a color plot
Z = Z.reshape(xx.shape)
plt.figure(1, figsize=(8, 6))
plt.contourf(xx, yy, Z, cmap=plt.cm.RdBu, alpha=0.8)

# Plot the training points
plt.scatter(X_subset[:, 0], X_subset[:, 1], c=y_train, cmap=plt.cm.RdBu, edgecolors='k')

plt.xlabel(feature1)
plt.ylabel(feature2)
plt.title("Logistic Regression Decision Boundary")
plt.show()