In [None]:
from sklearn.datasets import load_files
import numpy as np

In [None]:
# Load only the labelled reviews. The v1.0 archive's train/ directory also
# contains 50,000 unlabeled documents meant for unsupervised learning; without
# the `categories` filter they would be loaded as a third class and pollute
# both the class counts and every model trained on y_train.
reviews_train = load_files(
    "./aclImdb_v1/aclImdb/train/", categories=["neg", "pos"]
)
# load_files returns a Bunch object holding the training texts (.data)
# and the training labels (.target).
text_train, y_train = reviews_train.data, reviews_train.target
print("type of text_train: {}".format(type(text_train)))
print("length of text_train: {}".format(len(text_train)))
print("text_train[0]:\n{}".format(text_train[0]))

### 对于 v1.0 版数据，其训练集大小是 75 000，而不是 25 000，因为其中还包含 50 000 个用于无监督学习的无标签文档。在进行后续操作之前，建议先将这 50 000 个无标签文档从训练集中剔除，例如在调用 `load_files` 时传入 `categories=["neg", "pos"]`，只加载带标签的两个类别。

In [None]:
# Display the dimensions of the training label array (one label per document).
np.shape(y_train)

In [None]:
# The raw reviews are byte strings that still contain HTML line-break markup;
# swap every "<br />" for a single space so it does not end up in the tokens.
text_train = list(map(lambda raw: raw.replace(b"<br />", b" "), text_train))

In [None]:
# Count how many training documents carry each integer label.
class_counts_train = np.bincount(y_train)
print("Samples per class (training): {}".format(class_counts_train))

In [None]:
# Read the test split from disk and unpack its documents and labels.
reviews_test = load_files("./aclImdb_v1/aclImdb/test/")
text_test = reviews_test.data
y_test = reviews_test.target

# Report the size of the split and how the labels are distributed.
n_test_docs = len(text_test)
class_counts_test = np.bincount(y_test)
print("Number of documents in test data: {}".format(n_test_docs))
print("Samples per class (test): {}".format(class_counts_test))

# Apply the same cleanup as for the training texts: replace HTML line breaks
# with a space.
text_test = list(map(lambda raw: raw.replace(b"<br />", b" "), text_test))

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the bag-of-words vocabulary from the training reviews and encode them
# in one pass. fit_transform is equivalent to fit followed by transform on the
# same data, but it avoids tokenizing the whole training corpus twice.
vect = CountVectorizer()
X_train = vect.fit_transform(text_train)
# Encode the test reviews with the vocabulary learned from the training data
# only, so no information from the test split leaks into the features.
X_test = vect.transform(text_test)

In [None]:
# Inspect the vocabulary the vectorizer learned: its size and its first entries.
feature_names = vect.get_feature_names_out()
n_features = len(feature_names)
leading_features = feature_names[:20]
print("Number of features: {}".format(n_features))
print("First 20 features:\n{}".format(leading_features))

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
import pickle



# Baseline: 5-fold cross-validated accuracy of logistic regression on the
# bag-of-words features.
# max_iter is raised from the default of 100 to 4000, the same value this
# notebook uses for its other LogisticRegression models; with the default the
# solver is likely to stop before converging on this high-dimensional data, so
# the reported accuracy would come from under-fitted models.
scores = cross_val_score(
    LogisticRegression(max_iter=4000), X_train, y_train, cv=5
)
print("Mean cross-validation accuracy: {:.2f}".format(np.mean(scores)))


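# logicreg = LogisticRegression(max_iter=4000)
# logicreg.fit(X_train, y_train)
# Save the fitted model. pickle serializes the model object itself to bytes, so the file is opened in "wb" (write-binary) mode; logicreg is the model being saved.
# pickle.dump(logicreg, open("textlogicreg.dat","wb"))   # open("textlogicreg.dat","wb") opens the file "textlogicreg.dat" for writing binary data

# Load the saved model (only unpickle files you created yourself: pickle.load can execute arbitrary code)
# loaded_model = pickle.load(open("textlogicreg.dat","rb"))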

In [None]:
# Display the dimensions of the test document-term matrix
# (documents x vocabulary entries).
np.shape(X_test)

In [None]:
# Display the dimensions of the test label array (one label per document).
np.shape(y_test)

In [None]:
from sklearn.model_selection import GridSearchCV
# Candidate values for the regularization parameter C, spanning five orders
# of magnitude.
param_grid = {"C": [0.001, 0.01, 0.1, 1, 10]}

# Evaluate every candidate with 5-fold cross-validation on the training data.
base_model = LogisticRegression(max_iter=4000)
grid = GridSearchCV(estimator=base_model, param_grid=param_grid, cv=5)
grid.fit(X_train, y_train)

# Report the best mean cross-validation score and the setting that achieved it.
best_cv_score = grid.best_score_
print("Best cross-validation score: {:.2f}".format(best_cv_score))
print("Best parameters: ", grid.best_params_)