In [None]:
import pandas as pd
import numpy as np
# Read the training, validation and test CSVs (in that order) into DataFrames.
train, valid, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/si670_kaggle1_train.csv',
        'data/si670_kaggle1_validation.csv',
        'data/test.csv',
    )
)

In [None]:
# Basic size features, computed identically for every split:
#   char_length - number of characters in the text
#   word_count  - number of whitespace-separated tokens
for split_df in (train, valid, test):
    split_df['char_length'] = split_df['text'].apply(len)
    split_df['word_count'] = split_df['text'].str.split().apply(len)

In [None]:
def _mean_word_length(text):
    """Return the mean length of the whitespace-separated words in ``text``.

    Returns 0 when the text contains no words.
    """
    words = text.split()
    if len(words) > 0:
        return np.mean([len(w) for w in words])
    return 0


# Add the average word length feature to every split.
for split_df in (train, valid, test):
    split_df['avg_word_len'] = split_df['text'].apply(_mean_word_length)

In [None]:
import re

def avg_sent_len(text):
    """Return the mean number of words per sentence in ``text``.

    Sentences are the non-blank fragments obtained by splitting on '.',
    '!' or '?'; words are whitespace-separated tokens. Returns 0 when no
    non-blank fragment exists.
    """
    words_per_sentence = []
    for fragment in re.split(r'[.!?]', text):
        fragment = fragment.strip()
        # Skip empty pieces produced by consecutive or trailing terminators.
        if fragment:
            words_per_sentence.append(len(fragment.split()))
    if len(words_per_sentence) == 0:
        return 0
    return np.mean(words_per_sentence)

# Add the average sentence length feature to every split.
for split_df in (train, valid, test):
    split_df['avg_sent_len'] = split_df['text'].apply(avg_sent_len)

In [None]:
def punctuation_freq(text, punc):
    """Return occurrences of ``punc`` in ``text`` divided by ``len(text)``.

    Returns 0 for empty text so the division is never attempted on a
    zero length.
    """
    total_chars = len(text)
    if total_chars > 0:
        return text.count(punc) / total_chars
    return 0

# Per-character frequency of each punctuation mark, added to every split.
# Dict order determines column order: comma, period, question, exclamation.
punctuation_columns = {
    'comma_ratio': ',',
    'period_ratio': '.',
    'question_ratio': '?',
    'exclamation_ratio': '!',
}
for split_df in (train, valid, test):
    for column, mark in punctuation_columns.items():
        # Bind `mark` as a default so the lambda captures the current value.
        split_df[column] = split_df['text'].apply(lambda x, mark=mark: punctuation_freq(x, mark))

In [None]:
def bigram_unique_ratio(text):
    """Return the fraction of distinct word bigrams among all bigrams in ``text``.

    Words are whitespace-separated tokens; a bigram is a pair of adjacent
    words. Returns 0 when the text has fewer than two words.
    """
    tokens = text.split()
    # Pair each word with its successor.
    pairs = list(zip(tokens, tokens[1:]))
    if len(pairs) == 0:
        return 0
    distinct_pairs = set(pairs)
    return len(distinct_pairs) / len(pairs)

def trigram_unique_ratio(text):
    """Return the fraction of distinct word trigrams among all trigrams in ``text``.

    Words are whitespace-separated tokens; a trigram is three adjacent
    words. Returns 0 when the text has fewer than three words.
    """
    words = text.split()
    # A text of n words has n - 2 trigrams. Iterating to n - 1 would append a
    # trailing 2-word tuple and score two-word texts as 1.0.
    trigrams = [tuple(words[i:i+3]) for i in range(len(words)-2)]
    if not trigrams:
        return 0
    return len(set(trigrams)) / len(trigrams)

# Add the bigram and trigram uniqueness features to every split.
for split_df in (train, valid, test):
    split_df['bigram_unique_ratio'] = split_df['text'].apply(bigram_unique_ratio)
    split_df['trigram_unique_ratio'] = split_df['text'].apply(trigram_unique_ratio)


In [None]:
from collections import Counter

def shannon_entropy(text):
    """Return the Shannon entropy (base 2) of the word distribution of ``text``.

    Words are whitespace-separated tokens and each distinct word's
    probability is its relative frequency. Returns 0 for text with no words.
    """
    tokens = text.split()
    if len(tokens) == 0:
        return 0
    frequency_table = Counter(tokens)
    occurrences = np.array(list(frequency_table.values()))
    probabilities = occurrences / occurrences.sum()
    weighted_logs = probabilities * np.log2(probabilities)
    return -np.sum(weighted_logs)

# Add the word-level Shannon entropy feature to every split.
for split_df in (train, valid, test):
    split_df['shannon_entropy'] = split_df['text'].apply(shannon_entropy)

In [None]:
# Preview the first rows of the training frame to inspect its columns.
train.head()

In [None]:
# Columns used as model inputs, in the order they appear in the matrices.
features_for_model = [
    'char_length',
    'word_count',
    'avg_word_len',
    'avg_sent_len',
    'comma_ratio',
    'period_ratio',
    'question_ratio',
    'exclamation_ratio',
    'bigram_unique_ratio',
    'trigram_unique_ratio',
    'shannon_entropy',
]

# Feature matrices for each split; only train and valid get a target here.
X_train, y_train = train[features_for_model], train['label']
X_valid, y_valid = valid[features_for_model], valid['label']
X_test = test[features_for_model]

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Learn the per-feature min/max on the training matrix only, then apply the
# same scaling to all three splits.
# NOTE: this overwrites X_train / X_valid / X_test, so re-running the cell
# without re-running the previous one scales already-scaled data.
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train, X_valid, X_test = (
    scaler.transform(matrix) for matrix in (X_train, X_valid, X_test)
)

In [None]:
#Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
# L2-penalised logistic regression with balanced class weights.
model = LogisticRegression(
    penalty='l2',
    C=0.1,
    class_weight='balanced',
    solver='liblinear',
    max_iter=100,
    random_state=670,
)
model.fit(X_train, y_train)

# Class probabilities on the validation split; kept under this name so it
# can be re-thresholded later.
y_valid_pred = model.predict_proba(X_valid)

# Column 1 is thresholded at 0.5 to get hard predictions.
# Assumes labels are 0/1 so that column 1 is the positive class - confirm.
positive_proba = y_valid_pred[:, 1]
y_pred = (positive_proba >= 0.5).astype(int)
print('F1 score:', f1_score(y_valid, y_pred))

In [None]:
# Re-score the same validation probabilities with a lower cut-off of 0.45.
lowered_threshold = 0.45
y_pred = (y_valid_pred[:, 1] >= lowered_threshold).astype(int)
print('F1 score:', f1_score(y_valid, y_pred))