In [5]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier

# Load the dataset
data = pd.read_csv("tmdb_cleaned_movies.csv")

# Preprocess: combine 'overview' and 'genres' for classification
data['text'] = data['overview'].fillna('') + " " + data['genres'].fillna('')

# Example: Ensure there's a 'category' column to classify (e.g., Action, Comedy, etc.)
# If this column doesn't exist, you need to add or simulate it.
# For demonstration, we will add a sample category column
data['category'] = data['genres'].apply(lambda x: 'Action' if 'Action' in str(x) else 'Drama')

# Vectorize the text data
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(data['text'])  # Features
y = data['category']  # Labels (categories)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the KNN classifier
K = 5  # Number of neighbors
knn = KNeighborsClassifier(n_neighbors=K, metric='cosine')

# Train the model on the training set
knn.fit(X_train, y_train)

# Predict on the test set
y_pred = knn.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
conf_matrix = confusion_matrix(y_test, y_pred)

# Display evaluation metrics
print("Evaluation Metrics:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print("\nConfusion Matrix:")
print(conf_matrix)

def recommend_movies(movie_title, data, vectorizer, knn_model, top_k=5):
    """
    Recommend movies similar to the given movie_title.
    
    :param movie_title: Title of the movie to base recommendations on.
    :param data: The original dataset containing movie titles and features.
    :param vectorizer: Trained TF-IDF vectorizer.
    :param knn_model: Trained KNN model.
    :param top_k: Number of recommendations to return.
    :return: List of recommended movies.
    """
    # Check if the movie exists in the dataset
    if movie_title not in data['title'].values:
        return f"Movie '{movie_title}' not found in the dataset."

    # Get the vector for the movie
    movie_index = data[data['title'] == movie_title].index[0]
    movie_vector = vectorizer.transform([data.iloc[movie_index]['text']])

    # Calculate similarities to all other movies
    distances, indices = knn_model.kneighbors(movie_vector, n_neighbors=top_k + 1)  # +1 to skip itself
    recommendations = []
    
    for idx, distance in zip(indices[0][1:], distances[0][1:]):  # Skip the first result (itself)
        recommended_movie = data.iloc[idx]['title']
        recommendations.append((recommended_movie, distance))

    return recommendations

# Example Usage
movie_title = "Iron Man"  # Replace with the movie you want recommendations for
recommendations = recommend_movies(movie_title, data, vectorizer, knn, top_k=5)

print(f"Recommendations for '{movie_title}':")
for i, (rec_title, score) in enumerate(recommendations, 1):
    print(f"{i}. {rec_title} (Similarity: {1 - score:.4f})")

Evaluation Metrics:
Accuracy: 0.8615
Precision: 0.8442
Recall: 0.8615
F1 Score: 0.8372

Confusion Matrix:
[[  97  234]
 [  43 1626]]
Recommendations for 'Iron Man':
1. G.I. Jane (Similarity: 0.2186)
2. TerrorStorm (Similarity: 0.2070)
3. Nacht vor Augen (Similarity: 0.1689)
4. Last Man Standing (Similarity: 0.1673)
5. Presumed Innocent (Similarity: 0.1673)
