In [1]:
import pandas as pd
from sklearn.metrics import f1_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
import time

In [2]:
df_all = pd.read_csv("../data/preprocessed/data.csv")

In [3]:
from sklearn.model_selection import train_test_split

X = df_all["lemmatized"].apply(" ".join)   # Data
y = df_all["rating"]                       # Labels

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=df_all["category"]
)

In [4]:
def my_tokenizer(text):
    return text

In [5]:
ngram_configs = [(1, 1), (1, 2), (1, 3), (1, 4), (1, 5), (1, 6)]

# Logistic Regression

In [6]:
results = []

for ngram in ngram_configs:
    print(f"N-Gram: {ngram}...")
    
    start_time = time.time()
    
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(
            lowercase=False,        
            tokenizer=my_tokenizer, 
            token_pattern=None,
            ngram_range=ngram, 
            max_features=15000,
            min_df = 5,
            max_df = 0.95
        )),
        ('logreg', LogisticRegression(
            C=0.1, 
            class_weight='balanced',
            max_iter=1000,
            solver="saga"
        ))
    ])
    
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    end_time = time.time()
    duration = end_time - start_time
    
    score = f1_score(y_test, y_pred, average='macro')
    
    results.append({
        'ngram_range': str(ngram),
        'f1_score_macro': score,
        'duration': round(duration, 2)
    })

    print(f"Finished. Duration: {duration:.2f}s | F1-Score: {score:.4f}")

N-Gram: (1, 1)...
Finished. Duration: 11.36s | F1-Score: 0.3442
N-Gram: (1, 2)...
Finished. Duration: 43.98s | F1-Score: 0.3441
N-Gram: (1, 3)...
Finished. Duration: 59.31s | F1-Score: 0.4876
N-Gram: (1, 4)...
Finished. Duration: 90.59s | F1-Score: 0.4925
N-Gram: (1, 5)...
Finished. Duration: 133.41s | F1-Score: 0.5237
N-Gram: (1, 6)...
Finished. Duration: 179.45s | F1-Score: 0.5341


In [7]:
df_results = pd.DataFrame(results)
print(df_results)

  ngram_range  f1_score_macro  duration
0      (1, 1)        0.344152     11.36
1      (1, 2)        0.344090     43.98
2      (1, 3)        0.487635     59.31
3      (1, 4)        0.492500     90.59
4      (1, 5)        0.523660    133.41
5      (1, 6)        0.534081    179.45


# Naive Bayes

In [8]:
results = []

for ngram in ngram_configs:
    print(f"N-Gram: {ngram}...")

    start_time = time.time()
    
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(
            lowercase=False,        
            tokenizer=my_tokenizer, 
            token_pattern=None,
            ngram_range=ngram, 
            max_features=15000,
            min_df = 5,
            max_df = 0.95
        )),
        ('naivebayes', MultinomialNB(
            class_prior=None, 
            alpha = 1
        ))
    ])
    
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    end_time = time.time()
    duration = end_time - start_time
    
    score = f1_score(y_test, y_pred, average='macro')
    
    results.append({
        'ngram_range': str(ngram),
        'f1_score_macro': score,
        'duration': round(duration, 2)
    })

    print(f"Finished. Duration: {duration:.2f}s | F1-Score: {score:.4f}")

N-Gram: (1, 1)...
Finished. Duration: 7.08s | F1-Score: 0.3021
N-Gram: (1, 2)...
Finished. Duration: 28.86s | F1-Score: 0.3141
N-Gram: (1, 3)...
Finished. Duration: 54.22s | F1-Score: 0.4336
N-Gram: (1, 4)...
Finished. Duration: 82.51s | F1-Score: 0.4594
N-Gram: (1, 5)...
Finished. Duration: 113.89s | F1-Score: 0.5025
N-Gram: (1, 6)...
Finished. Duration: 155.89s | F1-Score: 0.5151


In [9]:
df_results = pd.DataFrame(results)
print(df_results)

  ngram_range  f1_score_macro  duration
0      (1, 1)        0.302076      7.08
1      (1, 2)        0.314142     28.86
2      (1, 3)        0.433550     54.22
3      (1, 4)        0.459424     82.51
4      (1, 5)        0.502495    113.89
5      (1, 6)        0.515082    155.89
