In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
data = pd.read_pickle('data/train.pkl')

Features for bot classification:
- 'bot' in username
- type of change (one-hot encoded)
- whether change is minor - does not work well because of Nan values
- time between edits
- length of commentary
- content of commentary (tf-idf or Bow)
- length of revision ('length.new'-'length.old')
- Number of requests per minute

In [3]:
data['bot'] = data['bot'].astype(int)

In [4]:
data['bot_in_username'] = data['user'].str.contains('bot', case=False).astype(int)

In [5]:
data['comment_len'] = data['comment'].str.len()

In [6]:
data['revision_len'] = data['length.new'] - data['length.old']
data['revision_len'] = data['revision_len'].fillna(0)

In [7]:
#one-hot encode type
data = pd.concat([data, data['type'].str.get_dummies()], axis=1)

In [8]:
data['datetime'] = pd.to_datetime(data['timestamp'], unit='s')

In [9]:
# avg changes by user per minute
changes_per_min = data.groupby(['user', pd.Grouper(key='datetime', freq='min')]).size().reset_index()\
    .rename(columns={0:'changes_per_min', 'datetime': 'minute'})
avg_changes_per_min = changes_per_min.groupby('user').agg(avg_changes_per_min=('changes_per_min', 'mean')).reset_index()
data = pd.merge(data, avg_changes_per_min, how='left', on='user')

## Train/test split

It would be logical to do split on users instead of items (i.e. so that the same user does not appear in both train and test data, thus creating bias).

In [10]:
all_users = data['user'].unique().tolist()
train_size = int(len(all_users) * 0.7)
np.random.seed(119)
train_users = np.random.choice(all_users, train_size)
train = data[data['user'].isin(train_users)]
test = data[~data['user'].isin(train_users)]
train.shape[0], test.shape[0]

(26247, 6649)

## BoW

We will also use comment contents as features. For this, we will encode them with bag of words.

In [11]:
bow = CountVectorizer(max_df=0.95, min_df=0.01)
bow_data = bow.fit(train['comment'].values)

In [12]:
text_colnames = [i + '_' for i in bow_data.get_feature_names_out()]
train_bow = pd.DataFrame(bow_data.transform(train['comment']).toarray(), columns=text_colnames)
test_bow = pd.DataFrame(bow_data.transform(test['comment']).toarray(), columns=text_colnames)

In [13]:
train = pd.concat([train.reset_index(drop=True), train_bow], axis=1)
test = pd.concat([test.reset_index(drop=True), test_bow], axis=1)

In [14]:
indep_vars = ['bot_in_username', 'comment_len', 'revision_len', 'avg_changes_per_min']
indep_vars.extend(text_colnames)

In [15]:
X_train = train[indep_vars]
y_train = train['bot']
X_test = test[indep_vars]
y_test = test['bot']

## Classification

### Logistic Regression

In [19]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [18]:
lr = LogisticRegression(solver='liblinear', max_iter=300)

In [35]:
lr.fit(X_train, y_train)

In [37]:
pred = lr.predict(X_test)
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5812
           1       1.00      0.97      0.98       837

    accuracy                           1.00      6649
   macro avg       1.00      0.98      0.99      6649
weighted avg       1.00      1.00      1.00      6649



## Random forest + feature importances

Random forest classifier, apart from potentially improving results, has a convenient feature importance score (Gini importance) that does not require feature standardization.

In [16]:
from sklearn.ensemble import RandomForestClassifier
import plotly.express as px

In [17]:
forest = RandomForestClassifier(random_state=99)
forest.fit(X_train, y_train)

In [20]:
pred = forest.predict(X_test)
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5812
           1       0.99      0.99      0.99       837

    accuracy                           1.00      6649
   macro avg       1.00      1.00      1.00      6649
weighted avg       1.00      1.00      1.00      6649



In [21]:
feature_names = X_train.columns.tolist()
importances = forest.feature_importances_

In [31]:
forest_importances = pd.DataFrame(zip(importances, feature_names), columns=['Mean decrease in impurity', 'feature name'])
forest_importances = forest_importances.sort_values('Mean decrease in impurity', ascending=False).reset_index(drop=True)

In [33]:
fig = px.bar(forest_importances[0:25], x='Mean decrease in impurity', y='feature name',
             orientation='h')
fig.update_layout(barmode='stack', yaxis={'categoryorder':'total ascending'},
                  width=800, height=800)

# fig.update_layout(yaxis=dict(autorange="reversed"))

Judging from the chart, by far two most important features are the appearance of 'bot' in name and average number of changes by user per minute.