In [72]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
import numpy as np
import pandas as pd
from collections import Counter 
from sklearn.utils import class_weight

In [73]:
training_data = pd.read_csv('data/aol/user-ct-test-collection-05.txt', sep="\t")
training_queries = training_data.Query.dropna()
training_tokens_freq = Counter(training_queries)

testing_data = pd.read_csv('data/aol/user-ct-test-collection-04.txt', sep="\t")
testing_queries = training_data.Query.dropna()
testing_tokens_freq = Counter(training_queries)

In [74]:
# Function to extract features
def extract_features(token):
    length = len(token)
    digits = sum(c.isdigit() for c in token)
    upper = sum(c.isupper() for c in token)
    lower = sum(c.islower() for c in token)
    special = sum(not c.isalnum() for c in token)
    entropy = -sum((token.count(c)/len(token)) * np.log2(token.count(c)/len(token)) for c in set(token))
    return [length, digits, upper, lower, special, entropy]

# Prepare the dataset
X_train = np.array([extract_features(token) for token, _ in training_tokens_freq.items()])
y_train = np.array([1 if freq >= 200 else 0 for _, freq in training_tokens_freq.items()])
X_test = np.array([extract_features(token) for token, _ in testing_tokens_freq.items()])
y_test = np.array([1 if freq >= 200 else 0 for _, freq in testing_tokens_freq.items()])

class_weights = class_weight.compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weights_dict = {i : class_weights[i] for i in range(len(class_weights))}

# Train the model
model = LogisticRegression(class_weight=class_weights_dict)
model.fit(X_train, y_train)

In [100]:
# Evaluate the model
predictions = (model.predict_proba(X_test)[:, 1] >= 0.94).astype(int)
print("Accuracy:", accuracy_score(y_test, predictions))
print("F1 Score:", f1_score(y_test, predictions))
confusion_matrix(y_test, predictions)

Accuracy: 0.9991651328559645
F1 Score: 0.03571428571428571


array([[1292519,     541],
       [    539,      20]])

In [76]:
from Oracle import Oracle
import pickle

filename = 'models/logistic_regression.sav'
pickle.dump(model, open(filename, 'wb'))