In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Load the IMDB review dataset from a CSV file in the current working directory.
try:
    df = pd.read_csv('IMDB Dataset.csv')
except FileNotFoundError:
    print("错误：请确保你已经下载了 'IMDB Dataset.csv' 文件，并把它和 notebook 放在同一个文件夹里！")
    # Stop with a non-zero status instead of calling the bare `exit()` helper:
    # `exit()` is injected by the `site` module for interactive sessions only,
    # reports success (status 0) when called without arguments, and in a
    # notebook it typically shuts the kernel down rather than just stopping
    # the cell.
    raise SystemExit(1) from None

# Quick sanity check: number of rows and a preview of the first five.
print(f"成功加载数据集！数据集有 {len(df)} 条评论。")
print("数据集前5行预览：")
print(df.head())


# 数据预处理：将标签从文字（positive/negative）转换成数字（1/0）
df['sentiment'] = df['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)

X = df['review']
y = df['sentiment']


# Hold out 20% of the samples as the test set and train on the other 80%.
# The fixed random_state keeps the split identical from run to run, so the
# experiment can be reproduced.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Report how many samples ended up on each side of the split.
n_train, n_test = len(X_train), len(X_test)
print("\n数据划分完成：")
print(f"训练集（教材）数量: {n_train}")
print(f"测试集（考卷）数量: {n_test}")


# Turn the review texts into numeric bag-of-words count vectors.
# The vocabulary is capped at the 5000 most frequent words.
VOCABULARY_SIZE = 5000
vectorizer = CountVectorizer(max_features=VOCABULARY_SIZE)

# The vocabulary is learned from the training reviews only; the test reviews
# are then encoded with that same, already-fitted vocabulary.
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)


# Build a logistic-regression classifier and fit it on the vectorized
# training reviews. The solver is allowed up to 1000 iterations.
model = LogisticRegression(max_iter=1000).fit(X_train_vec, y_train)

print("\n模型训练完成！")


# Evaluate on the held-out test set: predict a label for every test review
# and measure the fraction that matches the true labels.
predictions = model.predict(X_test_vec)
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)

# Shown as a percentage with two decimals.
accuracy_pct = accuracy * 100
print(f"\n模型在【测试集】上的准确率是: {accuracy_pct:.2f}%")


# Try the trained model on a few hand-written reviews.
print("\n--- 开始测试新的评论 ---")
new_reviews = [
    'The plot was amazing and the actors were great!',
    'I was so disappointed with this movie.',
    'It was an okay movie, not bad but not good either.'
]

# Encode the new texts with the already-fitted vectorizer so they share the
# feature space the model was trained on, then classify them.
new_reviews_transformed = vectorizer.transform(new_reviews)
new_predictions = model.predict(new_reviews_transformed)

# Print each review next to its predicted label.
print("对新评论的预测结果 (1=积极, 0=消极):")
for text, label in zip(new_reviews, new_predictions):
    if label == 1:
        verdict = '积极 (Positive)'
    else:
        verdict = '消极 (Negative)'
    print(f"'{text}' -> {verdict}")

成功加载数据集！数据集有 50000 条评论。
数据集前5行预览：
                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive

数据划分完成：
训练集（教材）数量: 40000
测试集（考卷）数量: 10000

模型训练完成！

模型在【测试集】上的准确率是: 87.66%

--- 开始测试新的评论 ---
对新评论的预测结果 (1=积极, 0=消极):
'The plot was amazing and the actors were great!' -> 积极 (Positive)
'I was so disappointed with this movie.' -> 消极 (Negative)
'It was an okay movie, not bad but not good either.' -> 消极 (Negative)
