Task Details : Create a machine learning model that can predict the genre of a movie based on its plot summary or other textual information. You can use techniques like TF-IDF or word embeddings with classifiers such as Naive Bayes, Logistic Regression, or Support Vector Machines.

In [29]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
import joblib


In [30]:
df = pd.read_csv("train_data.txt", sep=":::", engine="python", header=None, names=["id", "title", "genre", "description"])
df.dropna(subset=["genre", "description"], inplace=True)
df["genre"] = df["genre"].str.lower().str.strip()
df["description"] = df["description"].str.strip()


In [31]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54214 entries, 0 to 54213
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id           54214 non-null  int64 
 1   title        54214 non-null  object
 2   genre        54214 non-null  object
 3   description  54214 non-null  object
dtypes: int64(1), object(3)
memory usage: 1.7+ MB


In [32]:
df.head()

Unnamed: 0,id,title,genre,description
0,1,Oscar et la dame rose (2009),drama,Listening in to a conversation between his doc...
1,2,Cupid (1997),thriller,A brother and sister with a past incestuous re...
2,3,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fiel...
3,4,The Secret Sin (1915),drama,To help their unemployed father make ends meet...
4,5,The Unrecovered (2007),drama,The film's title refers not only to the un-rec...


In [33]:
genre_counts = df['genre'].value_counts()
rare_genres = genre_counts[genre_counts < 2].index
df['genre'] = df['genre'].apply(lambda x: 'other' if x in rare_genres else x)

In [34]:
df['genre']

0              drama
1           thriller
2              adult
3              drama
4              drama
            ...     
54209         comedy
54210         horror
54211    documentary
54212         comedy
54213        history
Name: genre, Length: 54214, dtype: object

In [35]:
tfidf = TfidfVectorizer(stop_words="english", max_features=3000)
X = tfidf.fit_transform(df["description"])
y = df["genre"]

In [36]:

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


In [37]:
models = [
    ("Naive Bayes", MultinomialNB()),
    ("Logistic Regression", LogisticRegression(max_iter=300, solver="liblinear", class_weight="balanced")),
     ("SVM", LinearSVC())
]


In [38]:
results = {}
conf_matrices = {}


In [39]:

results = {}
conf_matrices = {}  

for name, model in models:
    print(f"\nTraining: {name}")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    acc = accuracy_score(y_val, y_pred)

    print(f"{name} Accuracy: {acc:.4f}")

    cm = confusion_matrix(y_val, y_pred)

    results[name] = {
        "accuracy": acc,
        "model": model,
        "matrix": cm
    }

    conf_matrices[name] = cm 



Training: Naive Bayes
Naive Bayes Accuracy: 0.5191

Training: Logistic Regression
Logistic Regression Accuracy: 0.4886

Training: SVM
SVM Accuracy: 0.5589


In [40]:
best_model_name = max(results, key=lambda name: results[name]["accuracy"])
best_model = results[best_model_name]["model"]

print(f"\n💾 Saving best model: {best_model_name}")
joblib.dump(best_model, "best_genre_model.joblib")
joblib.dump(tfidf, "tfidf_vectorizer.joblib")



💾 Saving best model: SVM


['tfidf_vectorizer.joblib']

In [41]:
test_df = pd.read_csv("test_data.txt", sep=":::", engine="python", header=None, names=["id", "title", "description"])
test_df.dropna(subset=["description"], inplace=True)
test_df["description"] = test_df["description"].str.strip()

loaded_model = joblib.load("best_genre_model.joblib")
loaded_vectorizer = joblib.load("tfidf_vectorizer.joblib")

X_final_test = loaded_vectorizer.transform(test_df["description"])
test_preds = loaded_model.predict(X_final_test)

test_df["predicted_genre"] = test_preds


In [42]:
print("\n🎬 Predicted Genres on Unlabeled Test Set:\n")
for idx, row in test_df.iterrows():
    print(f"{row['id']} :: {row['title']} → {row['predicted_genre']}")
    if idx >= 9:  
        break


🎬 Predicted Genres on Unlabeled Test Set:

1 ::  Edgar's Lunch (1998)  → short
2 ::  La guerra de papá (1977)  → drama
3 ::  Off the Beaten Track (2010)  → documentary
4 ::  Meu Amigo Hindu (2015)  → drama
5 ::  Er nu zhai (1955)  → drama
6 ::  Riddle Room (2016)  → drama
7 ::  L'amica (1969)  → drama
8 ::  Ina Mina Dika (1989)  → comedy
9 ::  Equinox Special: Britain's Tornados (2005)  → documentary
10 ::  Press (2011)  → drama
