In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Load datasets
# The training file is a CSV used here through its "review" (text) and
# "label" (class) columns; the test file is tab-separated and is used through
# its "sentence" (text) and "topic" (gold class) columns.
df_train = pd.read_csv("big_data.csv")
df_test = pd.read_csv("sentiment-topic-test.tsv", sep="\t")

# Clean training data
# Missing values must be dropped *before* the cast to str: astype(str) turns
# NaN into the literal text "nan", which would survive the empty-string filter
# below and end up as a bogus training document. Rows without a label cannot
# be used for supervised training either, so they are dropped as well.
# .copy() makes the result an independent frame so the column assignment
# below does not trigger pandas' chained-assignment warning.
df_train = df_train.dropna(subset=["review", "label"]).copy()
df_train["review"] = df_train["review"].astype(str).str.strip()
df_train = df_train[df_train["review"] != ""]

# TF-IDF vectorization
# The vectorizer is fitted on the training reviews only and then reused,
# unchanged, to transform the test sentences.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(df_train["review"])
# Test rows are kept (never dropped) so predictions stay aligned with df_test;
# a missing sentence is fed in as an empty document instead of NaN, which the
# vectorizer would otherwise reject.
X_test = vectorizer.transform(df_test["sentence"].fillna("").astype(str))

# Train model
# NOTE(review): the training labels ("label") are assumed to use the same
# class names as the test column "topic" -- confirm against the data files.
clf = LogisticRegression()
clf.fit(X_train, df_train["label"])

# Predict
df_test["predicted_topic"] = clf.predict(X_test)

# Show some results
print(df_test[["sentence", "topic", "predicted_topic"]].head())

# Evaluate performance
# Per-class precision/recall/F1 of the predictions against the gold "topic".
print("\nClassification Report:")
print(classification_report(df_test["topic"], df_test["predicted_topic"], digits=3))



                                            sentence   topic predicted_topic
0  The stadium was alive with the roar of the cro...  sports          sports
1  That last-minute goal had me jumping out of my...  sports            book
2  I couldn’t put the book down; it swept me into...    book            book
3  The story had its moments, though some parts f...    book            book
4  I enjoyed the way the timelines shifted, even ...    book            book

Classification Report:
              precision    recall  f1-score   support

        book      0.667     1.000     0.800         6
       movie      1.000     0.500     0.667         6
      sports      0.833     0.833     0.833         6

    accuracy                          0.778        18
   macro avg      0.833     0.778     0.767        18
weighted avg      0.833     0.778     0.767        18

