In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.feature_selection import SelectKBest, chi2, SelectPercentile

# --- Baseline: TF-IDF features from both text columns -> random forest ---

# Read the train and test splits from the CSV files under AGNews/.
data_train = pd.read_csv("AGNews/train.csv")
data_test = pd.read_csv("AGNews/test.csv")

# Name of the label column; every other column is treated as a feature.
target = "Class Index"

# Separate features (x_*) from labels (y_*) for each split.
x_train, y_train = data_train.drop(columns=target), data_train[target]
x_test, y_test = data_test.drop(columns=target), data_test[target]

# One TF-IDF vectorizer per text column: unigrams only for "Title",
# unigrams + bigrams for "Description". English stop words are removed.
title_tfidf = TfidfVectorizer(stop_words="english", ngram_range=(1, 1))
description_tfidf = TfidfVectorizer(stop_words="english", ngram_range=(1, 2))

# Each vectorizer is applied to its own column; the resulting feature
# matrices are concatenated by the ColumnTransformer.
preprocessor = ColumnTransformer([
    ("Title", title_tfidf, "Title"),
    ("Description", description_tfidf, "Description"),
])

# NOTE(review): the final step is keyed "regressor" although it holds a
# classifier. The key is kept unchanged because code outside this cell may
# address the step by that name.
forest = RandomForestClassifier(random_state=100)
model = Pipeline([
    ("pre_processor", preprocessor),
    ("regressor", forest),
])

# Fit on the training split, then predict labels for the test split.
y_predict = model.fit(x_train, y_train).predict(x_test)

# Per-class precision / recall / F1 on the test split.
report = classification_report(y_true=y_test, y_pred=y_predict)
print(report)

              precision    recall  f1-score   support

           1       0.91      0.90      0.91      1900
           2       0.92      0.98      0.95      1900
           3       0.88      0.85      0.86      1900
           4       0.88      0.87      0.87      1900

    accuracy                           0.90      7600
   macro avg       0.90      0.90      0.90      7600
weighted avg       0.90      0.90      0.90      7600



In [3]:

# --- Variant: prune the Description vocabulary and keep only the top chi2 features ---
# Relies on x_train / y_train / x_test / y_test created in an earlier cell.

# "Title" keeps the same unigram TF-IDF settings as the earlier cell.
title_tfidf = TfidfVectorizer(stop_words="english", ngram_range=(1, 1))

# "Description" additionally drops terms by document frequency: float values
# for min_df / max_df are proportions of documents, so terms appearing in
# fewer than 1% or more than 99% of descriptions are ignored.
description_tfidf = TfidfVectorizer(
    stop_words="english",
    ngram_range=(1, 2),
    min_df=0.01,
    max_df=0.99,
)

preprocessor = ColumnTransformer([
    ("Title", title_tfidf, "Title"),
    ("Description", description_tfidf, "Description"),
])

# Keep the 5% of the combined TF-IDF features with the highest chi-squared
# score against the labels before training the forest.
top_chi2_features = SelectPercentile(chi2, percentile=5)

# NOTE(review): the final step is keyed "regressor" although it holds a
# classifier. The key is kept unchanged because code outside this cell may
# address the step by that name.
forest = RandomForestClassifier(random_state=100)
model = Pipeline([
    ("pre_processor", preprocessor),
    ("feature_selector", top_chi2_features),
    ("regressor", forest),
])

# Fit on the training split, then predict labels for the test split.
y_predict = model.fit(x_train, y_train).predict(x_test)

# Per-class precision / recall / F1 on the test split.
report = classification_report(y_true=y_test, y_pred=y_predict)
print(report)

              precision    recall  f1-score   support

           1       0.86      0.85      0.85      1900
           2       0.86      0.92      0.88      1900
           3       0.83      0.80      0.81      1900
           4       0.82      0.80      0.81      1900

    accuracy                           0.84      7600
   macro avg       0.84      0.84      0.84      7600
weighted avg       0.84      0.84      0.84      7600

