In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import string

import sklearn.metrics as metrics
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier

import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# imported above - confirm that is intended.
warnings.filterwarnings("ignore")

# CTRL + SHIFT + P => Run Code
import sys
# Record the interpreter and scikit-learn versions used for this run.
print('Python', sys.version)
print(sklearn.__version__)

Python 3.7.0 (default, Jun 28 2018, 08:04:48) [MSC v.1912 64 bit (AMD64)]
0.20.3


### Results Collection

In [2]:
# Empty results table with one column per reported metric.
# Presumably one row is added per classifier run - TODO confirm against later cells.
ResultsCollection = pd.DataFrame(
    columns=('Algorithm', 'FeatureSet', 'TestAcc', 'TestAUC')
)

# Fraction of the data held out as the test split.
testSize = 0.20

# Build the folder name to save to: one timestamped sub-folder per run.
from datetime import datetime
import os

date = datetime.now().strftime("%Y%m%d-%H%M%S") + "\\"

# NOTE(review): absolute, machine-specific Windows path - consider making it configurable.
path = "D:\\0_MyFiles\\0_Libraries\\Documents\\Education\\University\\Year 3\\FYP_Git\\ParamSet\\" + date

# Folder creation is currently disabled:
#if not os.path.exists(path):
#    os.makedirs(path)

# Feature Set List

In [3]:
# Switch between the available feature sets
def getFeatureSet(number):
    """Return the list of column names that make up feature set `number`.

    Known sets are 1, 2, 3 and 99. Every list ends with the 'bot' column.
    A fresh list is built on each call.

    Raises:
        KeyError: if `number` is not one of the known feature sets.
    """
    # The four raw count columns shared by every feature set.
    shared_counts = ['followers_count', 'friends_count', 'statuses_count', 'favourites_count']

    feature_sets = {
        1: shared_counts + ['listed_count', 'verified', 'bot'],
        2: shared_counts + ['listed_count_binary', 'verified',
                            'name_binary', 'description_binary', 'screen_name_binary', 'bot'],
        3: shared_counts + ['listed_count', 'verified',
                            'name_binary', 'description_binarySTEM', 'screen_name_binary', 'bot'],
        99: shared_counts + ['listed_count', 'verified',
                             'name', 'screen_name', 'description', 'bot'],
    }
    return feature_sets[number]

# Import Training Data

In [4]:
# Read the three source datasets from csv
training_data = pd.read_csv('training_data.csv')
t2 = pd.read_csv('genuine_accounts_users.csv')
t3 = pd.read_csv('fake_followers_users.csv')

features = getFeatureSet(99)

# These two files carry no label column of their own here: label the
# genuine-accounts file 0 and the fake-followers file 1.
t2['bot'] = 0
t3['bot'] = 1

# Restrict every dataset to the same columns so they can be stacked.
t2 = t2[features]
t3 = t3[features]
training_data = training_data[features]

# pd.concat replaces DataFrame.append, which is deprecated and removed in pandas 2.0.
new_data = pd.concat([t2, t3], ignore_index=True)
training_data = pd.concat([new_data, training_data], ignore_index=True)

# Shuffle the rows. The seed is fixed because the later train/test split is
# seeded too; an unseeded shuffle here would make that split differ on every run.
training_data = training_data.sample(frac=1, random_state=10).reset_index(drop=True)

# Assign the filled columns back instead of using a chained `inplace=True`,
# which may act on a temporary copy and silently do nothing.
training_data['description'] = training_data['description'].fillna(' ')
# NOTE(review): missing values are filled with the *string* '0' (kept as in the
# original) - confirm whether a numeric 0 / False was intended.
training_data['verified'] = training_data['verified'].fillna('0')

# Feature Development

### Stemming applied to description

In [5]:
def getTextDesc(desc):
    """Return `desc` unchanged, or '' when it is a float.

    pandas represents missing text as the float NaN, so any float value is
    treated as "no description". Values of every other type are returned as-is.
    """
    # The former try/except could never trigger (isinstance does not raise
    # here) and would only have hidden real errors, so it has been removed.
    if isinstance(desc, float):
        return ''
    return desc
    
# Normalise the description column element-wise. Series.apply avoids building a
# row object for every record, which DataFrame.apply(axis=1) does.
training_data['description'] = training_data['description'].apply(getTextDesc)

def stemLine(sentence):
    """Return `sentence` with punctuation removed, lower-cased and stemmed.

    Characters in `string.punctuation` are deleted, the text is lower-cased and
    tokenized with `word_tokenize`, each token is stemmed with the Porter
    stemmer, and the stems are joined with single spaces.
    """
    porter = nltk.PorterStemmer()
    punctuation_table = str.maketrans("", "", string.punctuation)
    cleaned = sentence.translate(punctuation_table).lower()

    stems = []
    for token in word_tokenize(cleaned):
        stems.append(porter.stem(token))
    return " ".join(stems)

# Stemmed copy of every description, stored alongside the original text.
training_data['descriptionStemmed'] = training_data['description'].apply(stemLine)

### Binary True/False Conditions

In [6]:
# Binary keyword flags.
# Each pattern is a regular-expression alternation, matched case-insensitively
# as a substring; missing text counts as "no match" (na=False).
# NOTE(review): name_keywords is defined but not used in this cell - name and
# screen_name are matched against `keywords`. Confirm which was intended.
name_keywords = r'bot|b0t|papers'
stemmedKeywords = r'bot|b0t|random|#botally|creat|thi|time|//|botal|pubm|made|im|gener|day|everi|paper|follow|tweet|word'
keywords = r'bot|b0t|papers|#botally|follow|every|made|//|random|day|daily|tweet|tweets|made'

training_data['name_binary'] = training_data['name'].str.contains(keywords, case=False, na=False)
training_data['screen_name_binary'] = training_data['screen_name'].str.contains(keywords, case=False, na=False)

# NOTE(review): the stemmed keyword list is matched against the *unstemmed*
# description column, not descriptionStemmed - confirm this is intended.
training_data['description_binarySTEM'] = training_data['description'].str.contains(stemmedKeywords, case=False, na=False)

training_data['description_binary'] = training_data['description'].str.contains(keywords, case=False, na=False)
#training_data['status_binary'] = training_data.status.str.contains(keywords, case=False, na=False)

# True when listed_count is NOT above 20000 (missing counts also yield True).
training_data['listed_count_binary'] = ~(training_data['listed_count'] > 20000)

### Count Vectors (bag-of-words term counts; no TF-IDF weighting is applied in this cell)

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import CountVectorizer
import scipy.sparse as sparse

# fillna returns a new Series; the result must be assigned back, otherwise the
# call has no effect on training_data.
training_data['description'] = training_data.description.fillna(' ')
training_data['screen_name'] = training_data.screen_name.fillna(' ')

# Dense term-count matrix over the raw descriptions.
# NOTE(review): .toarray() materialises the full dense matrix, which can be
# very large - kept because later cells may rely on a dense array.
vectorizer = CountVectorizer()
CountVec = vectorizer.fit_transform(training_data.description.values.astype('U')).toarray()

# Dense term-count matrix over the stemmed descriptions. `vectorizer` is
# rebound here, so from this point on it holds the stemmed vocabulary only.
vectorizer = CountVectorizer()
CountVecStemmed = vectorizer.fit_transform(training_data.descriptionStemmed.values.astype('U')).toarray()

# Global Classifier Functions

In [8]:
# Using the given features, create the training and test X and Y
def initiateXY(features, testsize):
    """Split the global `training_data` into train/test inputs and labels.

    Args:
        features: column names to select; the LAST one is used as the label
            and all preceding ones as the inputs.
        testsize: passed to train_test_split as `test_size`.

    Returns:
        Whatever train_test_split returns for (inputs, labels) with
        random_state=10.
    """
    selected = training_data[features]
    inputs = selected.iloc[:, :-1]   # every column except the last
    labels = selected.iloc[:, -1]    # the last column
    return train_test_split(inputs, labels, test_size=testsize, random_state=10)

# Get the accuracy of a classifier
def get_results(classifer, x, y):
    """Return accuracy_score of `classifer`'s predictions for `x` against `y`."""
    return accuracy_score(y, classifer.predict(x))

# Compute the test-set ROC AUC (nothing is plotted)
def getAUC(clf, xtest, ytest, xtrain, ytrain, runName, feauteSet):
    """Return the area under the ROC curve of `clf` on the test data.

    Args:
        clf: fitted classifier exposing `predict_proba`.
        xtest, ytest: test inputs and labels used for the ROC curve.
        xtrain, ytrain, runName, feauteSet: accepted for call compatibility;
            not used by the computation.

    Returns:
        auc(fpr, tpr) of the ROC curve computed with pos_label=1.
    """
    # Column 1 of predict_proba is taken as the score for label 1
    # (assumes the classifier's classes are ordered [0, 1] - TODO confirm).
    # The training-set scores/ROC that used to be computed here were never
    # used, so they are no longer calculated.
    positive_scores = np.asarray(clf.predict_proba(xtest))[:, 1]

    fpr_test, tpr_test, _ = roc_curve(ytest, positive_scores, pos_label=1)
    return auc(fpr_test, tpr_test)

# Calculate and print a classifiers accuracy
def testClassifer(classifier, xtrain, xtest, ytrain, ytest, printRoc, runName, feauteSet):
    acc_train = get_results(classifier, xtrain, ytrain)
    acc_test = get_results(classifier, xtest, ytest)
    
    if printRoc == True:
        AUC = getAUC(classifier, xtest, ytest, xtrain, ytrain, runName, feauteSet)
    
    print('Training Accuracy:  ', acc_train)
    print('Test Accuracy:  ', acc_test)
    print('AUC:', AUC)
    
    return acc_test, AUC

# Classifiers

### Logistic Regression

In [11]:
def run_RF(feauteSet, testsize, printRoc, runName, cv=3):
    """Grid-search the regularisation strength C of a logistic regression.

    NOTE: despite its name, this function tunes LogisticRegression, not a
    random forest. The name is kept so existing calls continue to work.

    Args:
        feauteSet: feature-set number passed to getFeatureSet.
        testsize: test fraction passed to initiateXY.
        printRoc, runName: accepted for signature parity with the other
            run_* helpers; not used here.
        cv: number of cross-validation folds for the grid search. Pinned to 3,
            the implicit default of the scikit-learn version this notebook was
            run with, so results do not shift with the library default.

    Returns:
        The best parameter dict found by the search (also printed).
    """
    features = getFeatureSet(feauteSet)
    # Only the training split is searched; the held-out split is not touched.
    xtrain, _, ytrain, _ = initiateXY(features, testsize)

    lr = LogisticRegression(random_state=10, solver='lbfgs', multi_class='multinomial')

    # Candidate values for C, spanning 1e-5 .. 1e3 in powers of ten.
    param_grid = {'C': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]}

    CV_rf = GridSearchCV(estimator=lr, param_grid=param_grid, cv=cv)
    CV_rf.fit(xtrain, ytrain)
    print(CV_rf.best_params_)
    return CV_rf.best_params_
    
# This run tunes a logistic regression, so label it as such (the label was
# previously 'Random Forest'). The trailing ';' keeps the notebook from echoing
# any return value.
run_RF(3, testSize, True, 'Logistic Regression');

{'C': 1e-05}
