In [15]:
# import libraries and packages
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split


In [2]:
# Load the fake-vs-authentic account dataset (path is relative to the notebook).
df_data = pd.read_csv("data/user_fake_authentic_2class.csv")

# Feature matrix: every column except the last one
# (the original note records its size as 65326 x 17).
data_x = df_data.iloc[:, :-1]

# Target: the last column, kept as a one-column DataFrame.
# Labels are "r" (real) and "f" (fake); recode them as 1 = real, 0 = fake.
data_y = df_data.iloc[:, -1:].replace({'class': {"r": 1, "f": 0}})

df_data

Unnamed: 0,pos,flw,flg,bl,pic,lin,cl,cz,ni,erl,erc,lt,hc,pr,fo,cs,pi,class
0,44,48,325,33,1,0,12,0.000000,0.000,0.000000,0.00,0.000,0.000,0.0,0.000,0.111111,0.094985,f
1,10,66,321,150,1,0,213,0.000000,1.000,14.390000,1.97,0.000,1.500,0.0,0.000,0.206826,230.412857,f
2,33,970,308,101,1,1,436,0.000000,1.000,10.100000,0.30,0.000,2.500,0.0,0.056,0.572174,43.569939,f
3,70,86,360,14,1,0,0,1.000000,0.000,0.780000,0.06,0.000,0.000,0.0,0.000,1.000000,5.859799,f
4,3,21,285,73,1,0,93,0.000000,0.000,14.290000,0.00,0.667,0.000,0.0,0.000,0.300494,0.126019,f
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65321,13,145,642,0,1,0,7,0.461538,0.000,14.270000,0.58,0.000,0.077,0.0,0.000,0.192308,1745.291260,r
65322,652,3000,1300,146,1,1,384,0.000000,0.389,8.520000,0.13,0.000,1.611,0.0,0.000,0.169917,54.629120,r
65323,1500,3700,3200,147,1,1,129,0.000000,0.111,9.390000,0.31,0.722,0.000,0.0,0.056,0.058908,129.802048,r
65324,329,1500,1800,218,1,1,290,0.055556,0.000,6.350000,0.26,0.222,0.500,0.0,0.000,0.103174,53.402840,r


In [24]:
# Normalise the features and wrap the resulting array back into a DataFrame
# that carries the original column names.
# NOTE(review): preprocessing.normalize with its default arguments rescales
# each ROW (sample) to unit L2 norm rather than each feature column -- confirm
# that per-sample scaling is what is wanted here.
# The extra "y_hat" column is a constant column of ones.
# NOTE(review): LogisticRegression fits its own intercept by default, so this
# column presumably duplicates it -- confirm it is needed.
norm_x = pd.DataFrame(
    preprocessing.normalize(data_x),
    columns=data_x.columns,
).assign(y_hat=np.ones(len(data_y)))
norm_x

Unnamed: 0,pos,flw,flg,bl,pic,lin,cl,cz,ni,erl,erc,lt,hc,pr,fo,cs,pi,y_hat
0,0.132007,0.144008,0.975053,0.099005,0.003000,0.000000,0.036002,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000333,0.000285,1.0
1,0.020912,0.138019,0.671273,0.313679,0.002091,0.000000,0.445424,0.000000,0.002091,0.030092,0.004120,0.000000,0.003137,0.0,0.000000,0.000433,0.481838,1.0
2,0.029645,0.871381,0.276686,0.090731,0.000898,0.000898,0.391672,0.000000,0.000898,0.009073,0.000269,0.000000,0.002246,0.0,0.000050,0.000514,0.039140,1.0
3,0.185676,0.228116,0.954904,0.037135,0.002653,0.000000,0.000000,0.002653,0.000000,0.002069,0.000159,0.000000,0.000000,0.0,0.000000,0.002653,0.015543,1.0
4,0.009690,0.067827,0.920511,0.235780,0.003230,0.000000,0.300377,0.000000,0.000000,0.046155,0.000000,0.002154,0.000000,0.0,0.000000,0.000971,0.000407,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65321,0.006969,0.077732,0.344165,0.000000,0.000536,0.000000,0.003753,0.000247,0.000000,0.007650,0.000311,0.000000,0.000041,0.0,0.000000,0.000103,0.935621,1.0
65322,0.194071,0.892962,0.386950,0.043458,0.000298,0.000298,0.114299,0.000000,0.000116,0.002536,0.000039,0.000000,0.000480,0.0,0.000000,0.000051,0.016261,1.0
65323,0.292853,0.722370,0.624752,0.028700,0.000195,0.000195,0.025185,0.000000,0.000022,0.001833,0.000061,0.000141,0.000000,0.0,0.000011,0.000012,0.025342,1.0
65324,0.137409,0.626483,0.751780,0.091049,0.000418,0.000418,0.121120,0.000023,0.000000,0.002652,0.000109,0.000093,0.000209,0.0,0.000000,0.000043,0.022304,1.0


In [10]:
# Collapse the one-column label frame into a flat 1-D array, the shape passed to fit().
y_arr = data_y.to_numpy().flatten()

In [25]:
# Fit an unregularised logistic regression on the normalised features.
# scikit-learn spells "no penalty" as the string "none" before version 1.2 and
# as None from 1.2 onward (the string form was removed in 1.4), so pick the
# spelling that the installed version accepts.
sklearn_major_minor = tuple(int(part) for part in sklearn.__version__.split(".")[:2])
no_penalty = None if sklearn_major_minor >= (1, 2) else "none"

# max_iter is set far above the library default to give the solver room to converge.
model = LogisticRegression(penalty=no_penalty, max_iter=10000)
model.fit(norm_x, y_arr)

LogisticRegression(max_iter=10000, penalty='none')

In [26]:
# Hard class predictions and class-1 probabilities, both computed on the same
# rows the model was fitted on.
y_pred = model.predict(norm_x)
y_pred_probs = model.predict_proba(norm_x)[:, 1]

# Threshold the probabilities at 0.5 with a single vectorised comparison
# instead of a Python-level loop over every row; the result is still an
# integer array of 0s and 1s.
y_pred_05 = (y_pred_probs > 0.5).astype(int)

# Sanity check: a correlation of 1 means the manual 0.5 threshold reproduces
# model.predict.
np.corrcoef(y_pred, y_pred_05)

array([[1., 1.],
       [1., 1.]])

In [27]:
# In-sample accuracy: the predictions were made on the same rows used to fit
# the model, so this is not a held-out estimate.
accuracy_score(y_true=y_arr, y_pred=y_pred_05)

0.78206227229587

In [35]:
# Pair each fitted coefficient with the name of the feature it belongs to.
# coef_ is 2-D, so it is flattened to line up with feature_names_in_.
# (The two bare attribute look-ups that used to precede this line were
# discarded by the notebook -- only a cell's last expression is displayed.)
pd.Series(model.coef_.flatten(), index=model.feature_names_in_)

pos        -0.074093
flw         3.652042
flg        -0.152145
bl          4.370258
pic      -140.782989
lin      1179.585295
cl          0.224226
cz        -26.294531
ni         81.712527
erl         4.744702
erc        29.958968
lt        264.306996
hc         34.759676
pr      -2875.106654
fo       -265.604226
cs       -201.123451
pi          2.767370
y_hat      -1.031736
dtype: float64