In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
import numpy as np
import pandas as pd
from collections import Counter 
from sklearn.utils import class_weight
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
training_data = pd.read_csv('data/aol/user-ct-test-collection-05.txt', sep="\t")
training_queries = training_data.Query.dropna()
training_tokens_freq = Counter(training_queries)

testing_data = pd.read_csv('data/aol/user-ct-test-collection-04.txt', sep="\t")
testing_queries = training_data.Query.dropna()
testing_tokens_freq = Counter(training_queries)

In [5]:
# Function to extract features
def extract_features(token):
    length = len(token)
    digits = sum(c.isdigit() for c in token)
    upper = sum(c.isupper() for c in token)
    lower = sum(c.islower() for c in token)
    special = sum(not c.isalnum() for c in token)
    entropy = -sum((token.count(c)/len(token)) * np.log2(token.count(c)/len(token)) for c in set(token))
    return [length, digits, upper, lower, special, entropy]

In [7]:
# Prepare the dataset
X_train = list(training_tokens_freq.keys())
y_train = list(training_tokens_freq.values())
X_test = list(testing_tokens_freq.keys())
y_test = list(testing_tokens_freq.values())

In [17]:
vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(2,2))

X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

In [19]:
# class_weights = class_weight.compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
# class_weights_dict = {i : class_weights[i] for i in range(len(class_weights))}

# Train the model
# model = LogisticRegression(class_weight=class_weights_dict)
model = LogisticRegression(max_iter=2)
model.fit(X_train_vectorized, y_train)

RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =      1080747     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  8.11227D+06    |proj g|=  7.77456D+05


 This problem is unconstrained.


In [100]:
# Evaluate the model
predictions = (model.predict_proba(X_test_vectorized)[:, 1] >= 0.94).astype(int)
print("Accuracy:", accuracy_score(y_test, predictions))
print("F1 Score:", f1_score(y_test, predictions))
confusion_matrix(y_test, predictions)

Accuracy: 0.9991651328559645
F1 Score: 0.03571428571428571


array([[1292519,     541],
       [    539,      20]])

In [76]:
from Oracle import Oracle
import pickle

filename = 'models/logistic_regression.sav'
pickle.dump(model, open(filename, 'wb'))