In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier

os.chdir('/home/trevor/disinformation')

In [2]:
# Import the data frames
wf = pd.read_csv('data/datasets/fake_domain_word_freq.csv')
df = pd.read_csv('data/datasets/fake_real_domains_combined.csv')
rs = pd.read_csv('data/datasets/reg_scores.csv')

In [3]:
# Score domain keywords based on word frequencies in the sample
for e, dom in enumerate(df['dom_split']):
    dom_split = dom.split(' ')
    score = 0
    for word in dom_split:
        try:
            prob = wf.loc[wf['word'] == word, 'samp_freq'].values[0]
        except IndexError: prob = 0
        score += prob
    df.loc[e, 'dom_score'] = score
mult = 1/df['dom_score'].max()
df['dom_score'] = df['dom_score'] * mult

In [4]:
# Score registrars based on word frequencies in the sample
for e, reg in enumerate(df['registrar']):
    score = 0
    other_val = rs.loc[rs['registrar'] == 'Other', 'reg_score'].values[0]
    try:
        prob = rs.loc[rs['registrar'] == reg, 'reg_score'].values[0]
    except IndexError: prob = other_val
    df.loc[e, 'reg_score'] = score

df.tail()

Unnamed: 0,rank,domain,tld,registrar,updated_date,creation_date,expiration_date,trust,dom_split,domain_length,keyword_length,num_nameservers,dom_age_days,dom_last_update,days_to_exp,update_to_exp,dom_score,reg_score
25296,,altervista,org,tucows,2020-08-29T20:04:42Z,2000-12-22T18:05:39Z,2025-12-22T18:05:39Z,initial trust,altervista,10,1,4,7251.736708,61.654034,1879.263292,1817.609257,0.0,0.0
25297,,at,ua,тов інтернет інвест,2020-07-17T13:34:02Z,2007-08-01T15:33:00Z,2021-08-01T15:32:58Z,initial trust,at,2,1,2,4838.842715,104.925331,275.157262,170.231931,0.0,0.0
25298,,readthedocs,io,namecheap,2020-05-15T07:14:10Z,2014-06-14T19:58:22Z,2021-06-14T19:58:22Z,initial trust,read the docs,11,3,4,2329.658433,168.189127,227.341567,59.15244,0.0,0.0
25299,,web,app,markmonitor,2019-12-12T10:38:09Z,2019-01-08T22:05:04Z,2021-01-08T22:05:04Z,initial trust,web,3,1,4,660.570446,323.047472,70.429553,-252.617919,0.004525,0.0
25300,,plesk,page,namecheap,2020-06-08T11:00:08Z,2020-03-18T03:06:27Z,2021-03-18T03:06:27Z,initial trust,plesk,5,1,3,226.361152,144.032206,138.638847,-5.393358,0.0,0.0


In [5]:
# Fill NAs for rank with the highest known one
df['rank'] = df['rank'].fillna(df['rank'].max())

In [6]:
features = ['domain_length', 'keyword_length', 'num_nameservers',
            'dom_age_days', 'dom_score', 'reg_score', 'rank']

len(df[features].dropna()) % 3

2

In [7]:
# Ensure even splits for the train/test/validate sets
print('Starting length: {}'.format(len(df)))
remove_n = len(df) % 3
drop_indices = np.random.choice(df.index, remove_n, replace=False)
df = df.drop(drop_indices)
print('Ending length: {}'.format(len(df)))

Starting length: 25301
Ending length: 25299


In [8]:
# Create the split datasets for train, validate, and test
train, validate, test = np.split(
    df.sample(frac=1, random_state=42), [int(.6*len(df)), int(.8*len(df))])

y = train['trust']

In [9]:
# Check for NaN values in the sets
print(train[features].isnull().sum().sum())
print(validate[features].isnull().sum().sum())
print(test[features].isnull().sum().sum())

0
0
0


In [10]:
df[features].tail()

Unnamed: 0,domain_length,keyword_length,num_nameservers,dom_age_days,dom_score,reg_score,rank
25296,10,1,4,7251.736708,0.0,0.0,765288.0
25297,2,1,2,4838.842715,0.0,0.0,765288.0
25298,11,3,4,2329.658433,0.0,0.0,765288.0
25299,3,1,4,660.570446,0.004525,0.0,765288.0
25300,5,1,3,226.361152,0.0,0.0,765288.0


In [15]:
# Create confusion matrix
pd.crosstab(test['trust'], predictions, rownames=['Actual trust'], colnames=['Predicted trust'])

Predicted trust,fake,initial trust
Actual trust,Unnamed: 1_level_1,Unnamed: 2_level_1
fake,508,24
initial trust,2,4526


In [16]:
# View a list of the features and their importance scores
list(zip(train[features], model.feature_importances_))

[('domain_length', 0.04914980029597304),
 ('keyword_length', 0.00864958674118593),
 ('num_nameservers', 0.009932276700031845),
 ('dom_age_days', 0.0775448165460135),
 ('dom_score', 0.22657044883649252),
 ('reg_score', 0.0),
 ('rank', 0.6281530708803031)]

In [17]:
# Review the output
output = pd.DataFrame({'domain': test.domain,
                       'assigned_trust': test.trust,
                       'prediction': predictions,
                       'correct': test.trust == predictions})

print(output.loc[output['correct'] == False])
num_false = len(output.loc[output['correct'] == False])
samp_size = len(output)
print('Number of samples: {}'.format(samp_size))
print('Number of false negatives: {}'.format(num_false))
print('Total correct: {}%'.format(1 - (num_false/samp_size)))

                        domain assigned_trust     prediction  correct
21932                pewtrusts           fake  initial trust    False
3915            nationalreview           fake  initial trust    False
18946       unofficialnetworks           fake  initial trust    False
21485           whatdoesitmean           fake  initial trust    False
22950  theeconomiccollapseblog           fake  initial trust    False
18580      investmentwatchblog           fake  initial trust    False
20196         americanprogress           fake  initial trust    False
19177             gellerreport           fake  initial trust    False
18607                x22report           fake  initial trust    False
187                  breitbart           fake  initial trust    False
16716            christianpost           fake  initial trust    False
4650                   pjmedia           fake  initial trust    False
18231           nutritionfacts           fake  initial trust    False
17916               

In [18]:
# Train the classifier
X = pd.get_dummies(train[features])
X_test = pd.get_dummies(validate[features])

model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)
model.fit(X, y)
predictions = model.predict(X_test)

pd.crosstab(validate['trust'], predictions, rownames=['Actual trust'], colnames=['Predicted trust'])

Predicted trust,fake,initial trust
Actual trust,Unnamed: 1_level_1,Unnamed: 2_level_1
fake,434,16
initial trust,3,4607


In [19]:
# Review the output
output = pd.DataFrame({'domain': validate.domain,
                       'assigned_trust': validate.trust,
                       'prediction': predictions,
                       'correct': validate.trust == predictions})

print(output.loc[output['correct'] == False])
num_false = len(output.loc[output['correct'] == False])
samp_size = len(output)
print('Number of samples: {}'.format(samp_size))
print('Number of false negatives: {}'.format(num_false))
print('Total correct: {}%'.format(1 - (num_false/samp_size)))

                         domain assigned_trust     prediction  correct
20357                  alternet           fake  initial trust    False
22166           zambianobserver           fake  initial trust    False
17464               voltairenet           fake  initial trust    False
19456                    unilad           fake  initial trust    False
17536           nomadcapitalist           fake  initial trust    False
25296                altervista  initial trust           fake    False
368                          rt           fake  initial trust    False
17530              lifesitenews           fake  initial trust    False
17748             veteranstoday           fake  initial trust    False
21289         consciouslifenews           fake  initial trust    False
5822                   redstate           fake  initial trust    False
20732           100percentfedup           fake  initial trust    False
18587  theconservativetreehouse           fake  initial trust    False
25293 

In [20]:
(0.9948616600790514 + 0.9962450592885376)/2

0.9955533596837944

In [21]:
len(test)*2

10120