# 0. Initialize

## 0.1. Import Libraries

In [134]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
import os, sys, glob
import gzip
import random
import tqdm
import json
import numpy as np
import pandas as pd
pd.set_option("display.max_columns", None)

from IPython import display
import matplotlib as mpl
from matplotlib import pyplot as plt

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## 0.2. DEFINE VARIABLES 

In [136]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## 0.3. Read Training & Evaluation Data

### 0.3.1. Get the labels for tweets

In [137]:
#trainingTweetDf = pd.read_csv('{}training-tweet.csv'.format(DATA_PATH))
trainingTweetDf = pd.read_csv('{}training-tweet.csv'.format(DATA_PATH), dtype={'tweet_id': str, 'isPolitical': str})
trainingTweetDf

Unnamed: 0,tweet_id,isPolitical
0,1597170281545551872,Yes
1,1431700027471192069,No
2,1566035577090281472,Yes
3,1591538690869940225,Yes
4,1583898169238167554,Yes
...,...,...
2995,1593539327623151619,Yes
2996,1393886554062524418,No
2997,1597925615092764672,Yes
2998,1585291418616176640,Yes


In [138]:
trainingTweetDf.isPolitical.value_counts()

Yes    2003
No      997
Name: isPolitical, dtype: int64

### 0.3.2. Get the labels for users

In [139]:
trainingUserDf = pd.read_csv('{}training-user.csv'.format(DATA_PATH))
#trainingUserDf = pd.read_csv('training-user.csv')
trainingUserDf

Unnamed: 0,screen_name,isBot
0,koftecancaddy,No
1,ahaber,No
2,selahat03949652,No
3,erdin06357062,No
4,bhct__necatii,No
...,...,...
2995,djblumenberg,No
2996,mel1sq,No
2997,eren_yz1,Yes
2998,ergnyildiz4,No


In [140]:
trainingUserDf.isBot.value_counts()

No     2424
Yes     576
Name: isBot, dtype: int64

# 1. EXTRACT FEATURES
Under *1.1. Political Tweet Detection* and *1.2. Bot Detection*, we firstly collect raw data for processing. We then combine some of them (total_interactions = num_favorites + num_retweets) or use them to extract features (whether the tweet has one of the political entities @meralaksener, @kilicdarogluk etc.).

In [142]:
PATH_TO_DOWNLOADED = DATA_PATH

## 1.1. Political Tweet Detection
This part stands for the feature extraction of tweets. We start with collecting the raw data from *tweet_metadata*, then use some of them to extract features.

### 1.1.1. Get Raw Data

#### 1.1.1.1. Check if Retweet

In [143]:
def check_if_retweet(tweet_metadata_line):
    is_retweet = 0
    retweeted_username = None

    try:
        tweet_metadata_line['retweeted_status']
        retweeted_username = tweet_metadata_line['retweeted_status']['user']['screen_name'].lower()
        is_retweet = 1

    except KeyError:
        pass

    return is_retweet, retweeted_username

#### 1.1.1.2. Get Tweet Text

In [144]:
def get_tweet_text(tweet_metadata_line):
    text = tweet_metadata_line['text']
    
    return text

#### 1.1.1.3. Get Tweet ID

In [145]:
def get_tweet_id(tweet_metadata_line):
    id_str = tweet_metadata_line['id_str']
    
    return id_str

#### 1.1.1.4. Get Number of Mentions and Hashtags

In [146]:
def get_number_mentions_hashtags(tweet_metadata_line):
    num_mentions = len(tweet_metadata_line['entities']['user_mentions'])
    num_hashtags = len(tweet_metadata_line['entities']['hashtags'])

    return num_mentions, num_hashtags

#### 1.1.1.5. Get Number of Retweets and Favorites

In [147]:
def get_number_retweets_favorites(tweet_metadata_line):
    retweet_count = tweet_metadata_line['retweet_count']
    favorite_count = tweet_metadata_line['favorite_count']
    
    return retweet_count, favorite_count

#### 1.1.1.6. Get User Info

In [148]:
def get_user_info(tweet_metadata_line):
    id = tweet_metadata_line['user']['id_str']
    screen_name = tweet_metadata_line['user']['screen_name'].lower()
    description = tweet_metadata_line['user']['description']

    return id, screen_name, description

### 1.1.2. Derive Manually Crafted Features

#### 1.1.2.1. Check for political entity in text

In [149]:
def check_political_ent(text):
    
    # the list below can be modified and some new names may be added (or removed)
    list_of_entities = ['meral_aksener', 'Meral Akşener', 'Meral', 'aksener', 'kilicdarogluk', 'Kemal Kılıçdaroğlu', 'Kılıçdaroğlu', 'kilicdaroglu', 'Kemal', 'vekilince', 'Muharrem İnce', 'RTErdogan','Recep Tayyip Erdoğan', 'Recep', 'Tayyip', 'Erdoğan', 'Erdogan', 'MevlutCavusoglu', 'Mevlüt Çavuşoğlu', 'umitozdag', 'umit ozdag', 'Ümit Özdağ', 'ekrem_imamoglu','Ekrem İmamoğlu',
    'Ekrem', 'İmamoğlu', 'imamoglu', 'ak parti', 'chp', 'cumhuriyet halk partisi', 'akp', 'iyi parti', 'iyiparti', 'Akparti', 'drfahrettinkoca', 'Fahrettin Koca', 'fahrettin', 'MHP_Bilgi', 'milliyetçi hareket partisi', 'MemleketimParti', 'milletvekil', 'mhp', 'Süleyman Soylu', 'suleymansoylu', 'suleyman soylu', 'Soylu', 'cumhurbaşkanı', 'hdp',
    'Gelecek Partisi', 'Zafer Partisi','Deva Partisi', 'Memleket Partisi', 'bakanı', 'NureddinNebati', 'Nurettin Nebati', 'tipgenelmerkez', 'HDPgenelmerkezi', 'hdpdemirtas', 'Selahattin Demirtaş', 'herkesicinCHP', 'alibabacan', 'Deva Partisi', 'devapartisi', 'Ali Babacan', 'Bahçeli', 'bahceli',
    'fuatoktay', 'Fuat Oktay', 'Mansur Yavaş', 'mansuryavas06', 'mansur', 'mansur yavas', 'faikoztrak', 'Faik Öztrak', 'enginaltaychp', 'Engin Altay', 'enginozkoc', 'Engin Özkoç', 'Ahmet Davutoğlu', 'Davutoğlu', 'Davutoglu', 'Ahmet_Davutoglu', 'ahmet davutoglu', 'GelecekPartiTR', 'BBahadirErdem', 'Bahadır Erdem', 'bahadir erdem', 'T_Karamollaoglu', 'Temel Karamollaoğlu',
    'SaadetPartisi', 'Saadet Partisi', 'zaferpartisi', 'akpartiistanbul', 'parti', 'af', 'öğretmen', 'politika', 'politik', 'meclis', 'belediye', 'kurul', 'dava', 'genel af', 'atama', 'mezun', 'faiz', 'ekonomi', 'yönet', '2023', 'seçim', 'muhalefet', 'dbdevletbahceli', 'bybekirbozdag', 'Bekir Bozdağ',
    'bekir bozdag', 'hapis', 'ceza', 'siyaset', 'aday', 'bakan', 'başkan', 'secim', 'baskan', 'oy', 'eyt', 'emekli', 'vedat bilgin', 'vedatbilgn', 'fetö', 'fethullah', 'fethullah gülen', 'fethullah gulen', 'özdağ', 'özdag', 'nato', 'mağdur', 'magdur',
    'cumhur', 'ittifak', 'atanma', 'memur', 'mağduriyet', 'vekil', 'deryayanikashb', 'derya yanik', 'derya yanık', 'terör', 'gurseltekin34', 'gürsel tekin', 'örgüt', 'aefakibaba', 'engin ozkoc', 'emine erdogan', 'emine erdoğan', 'EmineErdogan', 
    'suç', 'suc', 'yargi', 'yargı', 'ergenekon', 'Mustafa_Destici', 'sedat peker', 'başkan adayı', 'baskan adayi', 'varank', 'faruk çelik', 'farukcelikcomtr', 'LutfuTurkkan', 'lütfü türkkan', 'VahitKirisci', 'terörist', 'veliagbaba', 'YüzüncüYıla ErdoğanAffı', 'genelaf']
    
    
    entities_in_text = [ent for ent in list_of_entities if ent.lower() in text.lower()]
    number_entities = len(entities_in_text)

    return number_entities

In [150]:
# the list below can be modified and some new names may be added (or removed)
list_of_entities = ['meral_aksener', 'Meral Akşener', 'Meral', 'aksener', 'kilicdarogluk', 'Kemal Kılıçdaroğlu', 'Kılıçdaroğlu', 'kilicdaroglu', 'Kemal', 'vekilince', 'Muharrem İnce', 'RTErdogan','Recep Tayyip Erdoğan', 'Recep', 'Tayyip', 'Erdoğan', 'Erdogan', 'MevlutCavusoglu', 'Mevlüt Çavuşoğlu', 'umitozdag', 'umit ozdag', 'Ümit Özdağ', 'ekrem_imamoglu','Ekrem İmamoğlu',
    'Ekrem', 'İmamoğlu', 'imamoglu', 'ak parti', 'chp', 'cumhuriyet halk partisi', 'akp', 'iyi parti', 'iyiparti', 'Akparti', 'drfahrettinkoca', 'Fahrettin Koca', 'fahrettin', 'MHP_Bilgi', 'milliyetçi hareket partisi', 'MemleketimParti', 'milletvekil', 'mhp', 'Süleyman Soylu', 'suleymansoylu', 'suleyman soylu', 'Soylu', 'cumhurbaşkanı', 'hdp',
    'Gelecek Partisi', 'Zafer Partisi','Deva Partisi', 'Memleket Partisi', 'bakanı', 'NureddinNebati', 'Nurettin Nebati', 'tipgenelmerkez', 'HDPgenelmerkezi', 'hdpdemirtas', 'Selahattin Demirtaş', 'herkesicinCHP', 'alibabacan', 'Deva Partisi', 'devapartisi', 'Ali Babacan', 'Bahçeli', 'bahceli',
    'fuatoktay', 'Fuat Oktay', 'Mansur Yavaş', 'mansuryavas06', 'mansur', 'mansur yavas', 'faikoztrak', 'Faik Öztrak', 'enginaltaychp', 'Engin Altay', 'enginozkoc', 'Engin Özkoç', 'Ahmet Davutoğlu', 'Davutoğlu', 'Davutoglu', 'Ahmet_Davutoglu', 'ahmet davutoglu', 'GelecekPartiTR', 'BBahadirErdem', 'Bahadır Erdem', 'bahadir erdem', 'T_Karamollaoglu', 'Temel Karamollaoğlu',
    'SaadetPartisi', 'Saadet Partisi', 'zaferpartisi', 'akpartiistanbul', 'parti', 'af', 'öğretmen', 'politika', 'politik', 'meclis', 'belediye', 'kurul', 'dava', 'genel af', 'atama', 'mezun', 'faiz', 'ekonomi', 'yönet', '2023', 'seçim', 'muhalefet', 'dbdevletbahceli', 'bybekirbozdag', 'Bekir Bozdağ',
    'bekir bozdag', 'hapis', 'ceza', 'siyaset', 'aday', 'bakan', 'başkan', 'secim', 'baskan', 'oy', 'eyt', 'emekli', 'vedat bilgin', 'vedatbilgn', 'fetö', 'fethullah', 'fethullah gülen', 'fethullah gulen', 'özdağ', 'özdag', 'nato', 'mağdur', 'magdur',
    'cumhur', 'ittifak', 'atanma', 'memur', 'mağduriyet', 'vekil', 'deryayanikashb', 'derya yanik', 'derya yanık', 'terör', 'gurseltekin34', 'gürsel tekin', 'örgüt', 'aefakibaba', 'engin ozkoc', 'emine erdogan', 'emine erdoğan', 'EmineErdogan', 
    'suç', 'suc', 'yargi', 'yargı', 'ergenekon', 'Mustafa_Destici', 'sedat peker', 'başkan adayı', 'baskan adayi', 'varank', 'faruk çelik', 'farukcelikcomtr', 'LutfuTurkkan', 'lütfü türkkan', 'VahitKirisci', 'terörist', 'veliagbaba', 'YüzüncüYıla ErdoğanAffı', 'genelaf']
    
text="kilicdarogluk"
entities_in_text = [ent for ent in list_of_entities if ent.lower() in text.lower()]
entities_in_text

['kilicdarogluk', 'kilicdaroglu']

#### 1.1.2.2. Number of total interactions

In [151]:
def total_interactions(retweet_count, favorite_count):
    total_num_interactions = retweet_count + favorite_count
    
    return total_num_interactions

1.1.2.3. Number of political entities in the user's description

In [152]:
def political_entities_inDescription(text):
  # the list below can be modified and some new names may be added (or removed)
    list_of_entities = ['meral_aksener', 'Meral Akşener', 'Meral', 'aksener', 'kilicdarogluk', 'Kemal Kılıçdaroğlu', 'Kılıçdaroğlu', 'kilicdaroglu', 'Kemal', 'vekilince', 'Muharrem İnce', 'RTErdogan','Recep Tayyip Erdoğan', 'Recep', 'Tayyip', 'Erdoğan', 'Erdogan', 'MevlutCavusoglu', 'Mevlüt Çavuşoğlu', 'umitozdag', 'umit ozdag', 'Ümit Özdağ', 'ekrem_imamoglu','Ekrem İmamoğlu',
    'Ekrem', 'İmamoğlu', 'imamoglu', 'ak parti', 'chp', 'cumhuriyet halk partisi', 'akp', 'iyi parti', 'iyiparti', 'Akparti', 'drfahrettinkoca', 'Fahrettin Koca', 'fahrettin', 'MHP_Bilgi', 'milliyetçi hareket partisi', 'MemleketimParti', 'milletvekil', 'mhp', 'Süleyman Soylu', 'suleymansoylu', 'suleyman soylu', 'Soylu', 'cumhurbaşkanı', 'hdp',
    'Gelecek Partisi', 'Zafer Partisi','Deva Partisi', 'Memleket Partisi', 'bakanı', 'NureddinNebati', 'Nurettin Nebati', 'tipgenelmerkez', 'HDPgenelmerkezi', 'hdpdemirtas', 'Selahattin Demirtaş', 'herkesicinCHP', 'alibabacan', 'Deva Partisi', 'devapartisi', 'Ali Babacan', 'Bahçeli', 'bahceli',
    'fuatoktay', 'Fuat Oktay', 'Mansur Yavaş', 'mansuryavas06', 'mansur', 'mansur yavas', 'faikoztrak', 'Faik Öztrak', 'enginaltaychp', 'Engin Altay', 'enginozkoc', 'Engin Özkoç', 'Ahmet Davutoğlu', 'Davutoğlu', 'Davutoglu', 'Ahmet_Davutoglu', 'ahmet davutoglu', 'GelecekPartiTR', 'BBahadirErdem', 'Bahadır Erdem', 'bahadir erdem', 'T_Karamollaoglu', 'Temel Karamollaoğlu',
    'SaadetPartisi', 'Saadet Partisi', 'zaferpartisi', 'akpartiistanbul', 'parti', 'af', 'öğretmen', 'politika', 'politik', 'meclis', 'belediye', 'kurul', 'dava', 'genel af', 'atama', 'mezun', 'faiz', 'ekonomi', 'yönet', '2023', 'seçim', 'muhalefet', 'dbdevletbahceli', 'bybekirbozdag', 'Bekir Bozdağ',
    'bekir bozdag', 'hapis', 'ceza', 'siyaset', 'aday', 'bakan', 'başkan', 'secim', 'baskan', 'oy', 'eyt', 'emekli', 'vedat bilgin', 'vedatbilgn', 'fetö', 'fethullah', 'fethullah gülen', 'fethullah gulen', 'özdağ', 'özdag', 'nato', 'mağdur', 'magdur',
    'cumhur', 'ittifak', 'atanma', 'memur', 'mağduriyet', 'vekil', 'deryayanikashb', 'derya yanik', 'derya yanık', 'terör', 'gurseltekin34', 'gürsel tekin', 'örgüt', 'aefakibaba', 'engin ozkoc', 'emine erdogan', 'emine erdoğan', 'EmineErdogan', 
    'suç', 'suc', 'yargi', 'yargı', 'ergenekon', 'Mustafa_Destici', 'sedat peker', 'başkan adayı', 'baskan adayi', 'varank', 'faruk çelik', 'farukcelikcomtr', 'LutfuTurkkan', 'lütfü türkkan', 'VahitKirisci', 'terörist', 'veliagbaba', 'YüzüncüYıla ErdoğanAffı', 'genelaf']
    
    
    entities_in_text = [ent for ent in list_of_entities if ent.lower() in text.lower()]
    number_entities = len(entities_in_text)

    return number_entities

1.1.2.4. Did the user retweet the tweet from a politician

In [153]:
def political_retweet(text):
  # the list below can be modified and some new names may be added (or removed)
  list_of_entities = ['meral_aksener', 'Meral Akşener', 'Meral', 'aksener', 'kilicdarogluk', 'Kemal Kılıçdaroğlu', 'Kılıçdaroğlu', 'kilicdaroglu', 'Kemal', 'vekilince', 'Muharrem İnce', 'RTErdogan','Recep Tayyip Erdoğan', 'Recep', 'Tayyip', 'Erdoğan', 'Erdogan', 'MevlutCavusoglu', 'Mevlüt Çavuşoğlu', 'umitozdag', 'umit ozdag', 'Ümit Özdağ', 'ekrem_imamoglu','Ekrem İmamoğlu',
    'Ekrem', 'İmamoğlu', 'imamoglu', 'ak parti', 'chp', 'cumhuriyet halk partisi', 'akp', 'iyi parti', 'iyiparti', 'Akparti', 'drfahrettinkoca', 'Fahrettin Koca', 'fahrettin', 'MHP_Bilgi', 'milliyetçi hareket partisi', 'MemleketimParti', 'milletvekil', 'mhp', 'Süleyman Soylu', 'suleymansoylu', 'suleyman soylu', 'Soylu', 'cumhurbaşkanı', 'hdp',
    'Gelecek Partisi', 'Zafer Partisi','Deva Partisi', 'Memleket Partisi', 'bakanı', 'NureddinNebati', 'Nurettin Nebati', 'tipgenelmerkez', 'HDPgenelmerkezi', 'hdpdemirtas', 'Selahattin Demirtaş', 'herkesicinCHP', 'alibabacan', 'Deva Partisi', 'devapartisi', 'Ali Babacan', 'Bahçeli', 'bahceli',
    'fuatoktay', 'Fuat Oktay', 'Mansur Yavaş', 'mansuryavas06', 'mansur', 'mansur yavas', 'faikoztrak', 'Faik Öztrak', 'enginaltaychp', 'Engin Altay', 'enginozkoc', 'Engin Özkoç', 'Ahmet Davutoğlu', 'Davutoğlu', 'Davutoglu', 'Ahmet_Davutoglu', 'ahmet davutoglu', 'GelecekPartiTR', 'BBahadirErdem', 'Bahadır Erdem', 'bahadir erdem', 'T_Karamollaoglu', 'Temel Karamollaoğlu',
    'SaadetPartisi', 'Saadet Partisi', 'zaferpartisi', 'akpartiistanbul', 'parti', 'af', 'öğretmen', 'politika', 'politik', 'meclis', 'belediye', 'kurul', 'dava', 'genel af', 'atama', 'mezun', 'faiz', 'ekonomi', 'yönet', '2023', 'seçim', 'muhalefet', 'dbdevletbahceli', 'bybekirbozdag', 'Bekir Bozdağ',
    'bekir bozdag', 'hapis', 'ceza', 'siyaset', 'aday', 'bakan', 'başkan', 'secim', 'baskan', 'oy', 'eyt', 'emekli', 'vedat bilgin', 'vedatbilgn', 'fetö', 'fethullah', 'fethullah gülen', 'fethullah gulen', 'özdağ', 'özdag', 'nato', 'mağdur', 'magdur',
    'cumhur', 'ittifak', 'atanma', 'memur', 'mağduriyet', 'vekil', 'deryayanikashb', 'derya yanik', 'derya yanık', 'terör', 'gurseltekin34', 'gürsel tekin', 'örgüt', 'aefakibaba', 'engin ozkoc', 'emine erdogan', 'emine erdoğan', 'EmineErdogan', 
    'suç', 'suc', 'yargi', 'yargı', 'ergenekon', 'Mustafa_Destici', 'sedat peker', 'başkan adayı', 'baskan adayi', 'varank', 'faruk çelik', 'farukcelikcomtr', 'LutfuTurkkan', 'lütfü türkkan', 'VahitKirisci', 'terörist', 'veliagbaba', 'YüzüncüYıla ErdoğanAffı', 'genelaf']
  if(text!=None):
    entities_in_text = [ent for ent in list_of_entities if ent.lower() in text.lower()]
    if(len(entities_in_text)>=1):
      return True

  return False

1.1.2.5 Ratio of political entities in the tweet

In [154]:
def political_entities_ratio(text):
  # the list below can be modified and some new names may be added (or removed)
    list_of_entities = ['meral_aksener', 'Meral Akşener', 'Meral', 'aksener', 'kilicdarogluk', 'Kemal Kılıçdaroğlu', 'Kılıçdaroğlu', 'kilicdaroglu', 'Kemal', 'vekilince', 'Muharrem İnce', 'RTErdogan','Recep Tayyip Erdoğan', 'Recep', 'Tayyip', 'Erdoğan', 'Erdogan', 'MevlutCavusoglu', 'Mevlüt Çavuşoğlu', 'umitozdag', 'umit ozdag', 'Ümit Özdağ', 'ekrem_imamoglu','Ekrem İmamoğlu',
    'Ekrem', 'İmamoğlu', 'imamoglu', 'ak parti', 'chp', 'cumhuriyet halk partisi', 'akp', 'iyi parti', 'iyiparti', 'Akparti', 'drfahrettinkoca', 'Fahrettin Koca', 'fahrettin', 'MHP_Bilgi', 'milliyetçi hareket partisi', 'MemleketimParti', 'milletvekil', 'mhp', 'Süleyman Soylu', 'suleymansoylu', 'suleyman soylu', 'Soylu', 'cumhurbaşkanı', 'hdp',
    'Gelecek Partisi', 'Zafer Partisi','Deva Partisi', 'Memleket Partisi', 'bakanı', 'NureddinNebati', 'Nurettin Nebati', 'tipgenelmerkez', 'HDPgenelmerkezi', 'hdpdemirtas', 'Selahattin Demirtaş', 'herkesicinCHP', 'alibabacan', 'Deva Partisi', 'devapartisi', 'Ali Babacan', 'Bahçeli', 'bahceli',
    'fuatoktay', 'Fuat Oktay', 'Mansur Yavaş', 'mansuryavas06', 'mansur', 'mansur yavas', 'faikoztrak', 'Faik Öztrak', 'enginaltaychp', 'Engin Altay', 'enginozkoc', 'Engin Özkoç', 'Ahmet Davutoğlu', 'Davutoğlu', 'Davutoglu', 'Ahmet_Davutoglu', 'ahmet davutoglu', 'GelecekPartiTR', 'BBahadirErdem', 'Bahadır Erdem', 'bahadir erdem', 'T_Karamollaoglu', 'Temel Karamollaoğlu',
    'SaadetPartisi', 'Saadet Partisi', 'zaferpartisi', 'akpartiistanbul', 'parti', 'af', 'öğretmen', 'politika', 'politik', 'meclis', 'belediye', 'kurul', 'dava', 'genel af', 'atama', 'mezun', 'faiz', 'ekonomi', 'yönet', '2023', 'seçim', 'muhalefet', 'dbdevletbahceli', 'bybekirbozdag', 'Bekir Bozdağ',
    'bekir bozdag', 'hapis', 'ceza', 'siyaset', 'aday', 'bakan', 'başkan', 'secim', 'baskan', 'oy', 'eyt', 'emekli', 'vedat bilgin', 'vedatbilgn', 'fetö', 'fethullah', 'fethullah gülen', 'fethullah gulen', 'özdağ', 'özdag', 'nato', 'mağdur', 'magdur',
    'cumhur', 'ittifak', 'atanma', 'memur', 'mağduriyet', 'vekil', 'deryayanikashb', 'derya yanik', 'derya yanık', 'terör', 'gurseltekin34', 'gürsel tekin', 'örgüt', 'aefakibaba', 'engin ozkoc', 'emine erdogan', 'emine erdoğan', 'EmineErdogan', 
    'suç', 'suc', 'yargi', 'yargı', 'ergenekon', 'Mustafa_Destici', 'sedat peker', 'başkan adayı', 'baskan adayi', 'varank', 'faruk çelik', 'farukcelikcomtr', 'LutfuTurkkan', 'lütfü türkkan', 'VahitKirisci', 'terörist', 'veliagbaba', 'YüzüncüYıla ErdoğanAffı', 'genelaf']
    
    
    entities_in_text = [ent for ent in list_of_entities if ent.lower() in text.lower()]
    number_entities = len(entities_in_text)

    return (len(entities_in_text)/len(text))

1.1.2.6 Did the user mention any political entities?

In [155]:
def political_entities_mentioned(tweet_metadata):
  list_of_entities = ['meral_aksener', 'Meral Akşener', 'Meral', 'aksener', 'kilicdarogluk', 'Kemal Kılıçdaroğlu', 'Kılıçdaroğlu', 'kilicdaroglu', 'Kemal', 'vekilince', 'Muharrem İnce', 'RTErdogan','Recep Tayyip Erdoğan', 'Recep', 'Tayyip', 'Erdoğan', 'Erdogan', 'MevlutCavusoglu', 'Mevlüt Çavuşoğlu', 'umitozdag', 'umit ozdag', 'Ümit Özdağ', 'ekrem_imamoglu','Ekrem İmamoğlu',
    'Ekrem', 'İmamoğlu', 'imamoglu', 'ak parti', 'chp', 'cumhuriyet halk partisi', 'akp', 'iyi parti', 'iyiparti', 'Akparti', 'drfahrettinkoca', 'Fahrettin Koca', 'fahrettin', 'MHP_Bilgi', 'milliyetçi hareket partisi', 'MemleketimParti', 'milletvekil', 'mhp', 'Süleyman Soylu', 'suleymansoylu', 'suleyman soylu', 'Soylu', 'cumhurbaşkanı', 'hdp',
    'Gelecek Partisi', 'Zafer Partisi','Deva Partisi', 'Memleket Partisi', 'bakanı', 'NureddinNebati', 'Nurettin Nebati', 'tipgenelmerkez', 'HDPgenelmerkezi', 'hdpdemirtas', 'Selahattin Demirtaş', 'herkesicinCHP', 'alibabacan', 'Deva Partisi', 'devapartisi', 'Ali Babacan', 'Bahçeli', 'bahceli',
    'fuatoktay', 'Fuat Oktay', 'Mansur Yavaş', 'mansuryavas06', 'mansur', 'mansur yavas', 'faikoztrak', 'Faik Öztrak', 'enginaltaychp', 'Engin Altay', 'enginozkoc', 'Engin Özkoç', 'Ahmet Davutoğlu', 'Davutoğlu', 'Davutoglu', 'Ahmet_Davutoglu', 'ahmet davutoglu', 'GelecekPartiTR', 'BBahadirErdem', 'Bahadır Erdem', 'bahadir erdem', 'T_Karamollaoglu', 'Temel Karamollaoğlu',
    'SaadetPartisi', 'Saadet Partisi', 'zaferpartisi', 'akpartiistanbul', 'parti', 'af', 'öğretmen', 'politika', 'politik', 'meclis', 'belediye', 'kurul', 'dava', 'genel af', 'atama', 'mezun', 'faiz', 'ekonomi', 'yönet', '2023', 'seçim', 'muhalefet', 'dbdevletbahceli', 'bybekirbozdag', 'Bekir Bozdağ',
    'bekir bozdag', 'hapis', 'ceza', 'siyaset', 'aday', 'bakan', 'başkan', 'secim', 'baskan', 'oy', 'eyt', 'emekli', 'vedat bilgin', 'vedatbilgn', 'fetö', 'fethullah', 'fethullah gülen', 'fethullah gulen', 'özdağ', 'özdag', 'nato', 'mağdur', 'magdur',
    'cumhur', 'ittifak', 'atanma', 'memur', 'mağduriyet', 'vekil', 'deryayanikashb', 'derya yanik', 'derya yanık', 'terör', 'gurseltekin34', 'gürsel tekin', 'örgüt', 'aefakibaba', 'engin ozkoc', 'emine erdogan', 'emine erdoğan', 'EmineErdogan', 
    'suç', 'suc', 'yargi', 'yargı', 'ergenekon', 'Mustafa_Destici', 'sedat peker', 'başkan adayı', 'baskan adayi', 'varank', 'faruk çelik', 'farukcelikcomtr', 'LutfuTurkkan', 'lütfü türkkan', 'VahitKirisci', 'terörist', 'veliagbaba', 'YüzüncüYıla ErdoğanAffı', 'genelaf']
    
  list_len=len(tweet_metadata['entities']['user_mentions'])
  i=0
  number_entities=0
  if(tweet_metadata['entities']['user_mentions']!=[]):  
    while(i<list_len):  
      entities_in_text1 = [ent for ent in list_of_entities if ent.lower() in tweet_metadata['entities']['user_mentions'][i]['screen_name'].lower()]
      entities_in_text2 = [ent for ent in list_of_entities if ent.lower() in tweet_metadata['entities']['user_mentions'][i]['name'].lower()] 
      number_entities = number_entities+(len(entities_in_text1)+len(entities_in_text2))    
      i=i+1
  if(number_entities==0):
    return False
  else:
    return True

1.1.2.7 How many political entities did the user mention?

In [156]:
def num_political_entities_mentioned(tweet_metadata):
  list_of_entities = ['meral_aksener', 'Meral Akşener', 'Meral', 'aksener', 'kilicdarogluk', 'Kemal Kılıçdaroğlu', 'Kılıçdaroğlu', 'kilicdaroglu', 'Kemal', 'vekilince', 'Muharrem İnce', 'RTErdogan','Recep Tayyip Erdoğan', 'Recep', 'Tayyip', 'Erdoğan', 'Erdogan', 'MevlutCavusoglu', 'Mevlüt Çavuşoğlu', 'umitozdag', 'umit ozdag', 'Ümit Özdağ', 'ekrem_imamoglu','Ekrem İmamoğlu',
    'Ekrem', 'İmamoğlu', 'imamoglu', 'ak parti', 'chp', 'cumhuriyet halk partisi', 'akp', 'iyi parti', 'iyiparti', 'Akparti', 'drfahrettinkoca', 'Fahrettin Koca', 'fahrettin', 'MHP_Bilgi', 'milliyetçi hareket partisi', 'MemleketimParti', 'milletvekil', 'mhp', 'Süleyman Soylu', 'suleymansoylu', 'suleyman soylu', 'Soylu', 'cumhurbaşkanı', 'hdp',
    'Gelecek Partisi', 'Zafer Partisi','Deva Partisi', 'Memleket Partisi', 'bakanı', 'NureddinNebati', 'Nurettin Nebati', 'tipgenelmerkez', 'HDPgenelmerkezi', 'hdpdemirtas', 'Selahattin Demirtaş', 'herkesicinCHP', 'alibabacan', 'Deva Partisi', 'devapartisi', 'Ali Babacan', 'Bahçeli', 'bahceli',
    'fuatoktay', 'Fuat Oktay', 'Mansur Yavaş', 'mansuryavas06', 'mansur', 'mansur yavas', 'faikoztrak', 'Faik Öztrak', 'enginaltaychp', 'Engin Altay', 'enginozkoc', 'Engin Özkoç', 'Ahmet Davutoğlu', 'Davutoğlu', 'Davutoglu', 'Ahmet_Davutoglu', 'ahmet davutoglu', 'GelecekPartiTR', 'BBahadirErdem', 'Bahadır Erdem', 'bahadir erdem', 'T_Karamollaoglu', 'Temel Karamollaoğlu',
    'SaadetPartisi', 'Saadet Partisi', 'zaferpartisi', 'akpartiistanbul', 'parti', 'af', 'öğretmen', 'politika', 'politik', 'meclis', 'belediye', 'kurul', 'dava', 'genel af', 'atama', 'mezun', 'faiz', 'ekonomi', 'yönet', '2023', 'seçim', 'muhalefet', 'dbdevletbahceli', 'bybekirbozdag', 'Bekir Bozdağ',
    'bekir bozdag', 'hapis', 'ceza', 'siyaset', 'aday', 'bakan', 'başkan', 'secim', 'baskan', 'oy', 'eyt', 'emekli', 'vedat bilgin', 'vedatbilgn', 'fetö', 'fethullah', 'fethullah gülen', 'fethullah gulen', 'özdağ', 'özdag', 'nato', 'mağdur', 'magdur',
    'cumhur', 'ittifak', 'atanma', 'memur', 'mağduriyet', 'vekil', 'deryayanikashb', 'derya yanik', 'derya yanık', 'terör', 'gurseltekin34', 'gürsel tekin', 'örgüt', 'aefakibaba', 'engin ozkoc', 'emine erdogan', 'emine erdoğan', 'EmineErdogan', 
    'suç', 'suc', 'yargi', 'yargı', 'ergenekon', 'Mustafa_Destici', 'sedat peker', 'başkan adayı', 'baskan adayi', 'varank', 'faruk çelik', 'farukcelikcomtr', 'LutfuTurkkan', 'lütfü türkkan', 'VahitKirisci', 'terörist', 'veliagbaba', 'YüzüncüYıla ErdoğanAffı', 'genelaf']
  list_len=len(tweet_metadata['entities']['user_mentions'])
  i=0
  number_entities=0
  if(tweet_metadata['entities']['user_mentions']!=[]):  
    while(i<list_len):  
      entities_in_text1 = [ent for ent in list_of_entities if ent.lower() in tweet_metadata['entities']['user_mentions'][i]['screen_name'].lower()]
      entities_in_text2 = [ent for ent in list_of_entities if ent.lower() in tweet_metadata['entities']['user_mentions'][i]['name'].lower()]
      number_entities = number_entities+(len(entities_in_text1)+len(entities_in_text2))
      i=i+1
      
  return number_entities

1.1.2.8 Total mentions - political mentions ratio

In [157]:
def political_entities_mentioned_ratio(total_men,political_men):
  if(total_men!=0):
    return (political_men/(2*total_men))
  else:
    return 0

1.1.2.9 How many times did the political entities were used in the tweet?

In [158]:
def num_pol_ent_used(text):
  # the list below can be modified and some new names may be added (or removed)
  list_of_entities = ['meral_aksener', 'Meral Akşener', 'Meral', 'aksener', 'kilicdarogluk', 'Kemal Kılıçdaroğlu', 'Kılıçdaroğlu', 'kilicdaroglu', 'Kemal', 'vekilince', 'Muharrem İnce', 'RTErdogan','Recep Tayyip Erdoğan', 'Recep', 'Tayyip', 'Erdoğan', 'Erdogan', 'MevlutCavusoglu', 'Mevlüt Çavuşoğlu', 'umitozdag', 'umit ozdag', 'Ümit Özdağ', 'ekrem_imamoglu','Ekrem İmamoğlu',
    'Ekrem', 'İmamoğlu', 'imamoglu', 'ak parti', 'chp', 'cumhuriyet halk partisi', 'akp', 'iyi parti', 'iyiparti', 'Akparti', 'drfahrettinkoca', 'Fahrettin Koca', 'fahrettin', 'MHP_Bilgi', 'milliyetçi hareket partisi', 'MemleketimParti', 'milletvekil', 'mhp', 'Süleyman Soylu', 'suleymansoylu', 'suleyman soylu', 'Soylu', 'cumhurbaşkanı', 'hdp',
    'Gelecek Partisi', 'Zafer Partisi','Deva Partisi', 'Memleket Partisi', 'bakanı', 'NureddinNebati', 'Nurettin Nebati', 'tipgenelmerkez', 'HDPgenelmerkezi', 'hdpdemirtas', 'Selahattin Demirtaş', 'herkesicinCHP', 'alibabacan', 'Deva Partisi', 'devapartisi', 'Ali Babacan', 'Bahçeli', 'bahceli',
    'fuatoktay', 'Fuat Oktay', 'Mansur Yavaş', 'mansuryavas06', 'mansur', 'mansur yavas', 'faikoztrak', 'Faik Öztrak', 'enginaltaychp', 'Engin Altay', 'enginozkoc', 'Engin Özkoç', 'Ahmet Davutoğlu', 'Davutoğlu', 'Davutoglu', 'Ahmet_Davutoglu', 'ahmet davutoglu', 'GelecekPartiTR', 'BBahadirErdem', 'Bahadır Erdem', 'bahadir erdem', 'T_Karamollaoglu', 'Temel Karamollaoğlu',
    'SaadetPartisi', 'Saadet Partisi', 'zaferpartisi', 'akpartiistanbul', 'parti', 'af', 'öğretmen', 'politika', 'politik', 'meclis', 'belediye', 'kurul', 'dava', 'genel af', 'atama', 'mezun', 'faiz', 'ekonomi', 'yönet', '2023', 'seçim', 'muhalefet', 'dbdevletbahceli', 'bybekirbozdag', 'Bekir Bozdağ',
    'bekir bozdag', 'hapis', 'ceza', 'siyaset', 'aday', 'bakan', 'başkan', 'secim', 'baskan', 'oy', 'eyt', 'emekli', 'vedat bilgin', 'vedatbilgn', 'fetö', 'fethullah', 'fethullah gülen', 'fethullah gulen', 'özdağ', 'özdag', 'nato', 'mağdur', 'magdur',
    'cumhur', 'ittifak', 'atanma', 'memur', 'mağduriyet', 'vekil', 'deryayanikashb', 'derya yanik', 'derya yanık', 'terör', 'gurseltekin34', 'gürsel tekin', 'örgüt', 'aefakibaba', 'engin ozkoc', 'emine erdogan', 'emine erdoğan', 'EmineErdogan', 
    'suç', 'suc', 'yargi', 'yargı', 'ergenekon', 'Mustafa_Destici', 'sedat peker', 'başkan adayı', 'baskan adayi', 'varank', 'faruk çelik', 'farukcelikcomtr', 'LutfuTurkkan', 'lütfü türkkan', 'VahitKirisci', 'terörist', 'veliagbaba', 'YüzüncüYıla ErdoğanAffı','genelaf']
  text = text.lower()    
  count = 0
  list_of_entities = [word.lower() for word in list_of_entities]
  for word in list_of_entities:
    count += text.count(word)
  return count

### 1.1.2. Collect data using the functions above and transform into a Pandas DataFrame

In [159]:
dfPolitical = {'tweet_id':[],
              'is_retweet':[],
              'retweeted_username':[],
              'text':[],
              'num_mentions':[],
              'num_hashtags':[],
              'num_retweets':[],
              'num_favorites':[],
              'user_id':[],
              'user_screen_name':[],
              'user_description':[],
              'num_political_entities':[],
              'num_political_entities_inDescription':[],
              'retweeted_political': [],
              'total_interactions':[],
              'political_ratio':[],
              'num_user_mention_political':[],
              'user_mention_political':[],
              'political_entities_men_ratio':[],
              'num_political_ent_used':[]
               }

with gzip.open(f"{PATH_TO_DOWNLOADED}tweet_metadata.jsons.gz", "rb") as f:

    for line in f:
        line = json.loads(line)
        # raw data:
        id_str = get_tweet_id(line)
        is_retweet, retweeted_username = check_if_retweet(line)
        text = get_tweet_text(line)
        num_mentions, num_hashtags = get_number_mentions_hashtags(line)
        retweet_count, favorite_count = get_number_retweets_favorites(line)
        user_id_str, screen_name, user_description = get_user_info(line)

        # manually crafted data:
        num_political_entities = check_political_ent(text)
        total_num_interactions = total_interactions(retweet_count, favorite_count)
        retweeted_political = political_retweet(retweeted_username)
        num_political_entities_inDescription = political_entities_inDescription(user_description)
        political_ratio = political_entities_ratio(text)
        num_user_mention_political = num_political_entities_mentioned(line)
        user_mention_political = political_entities_mentioned(line)
        political_entities_men_ratio = political_entities_mentioned_ratio(num_mentions,num_user_mention_political)
        num_political_ent_used = num_pol_ent_used(text)

        dfPolitical['tweet_id'].append(id_str)
        dfPolitical['is_retweet'].append(is_retweet)
        dfPolitical['retweeted_username'].append(retweeted_username)
        dfPolitical['text'].append(text)
        dfPolitical['num_mentions'].append(num_mentions)
        dfPolitical['num_hashtags'].append(num_hashtags)
        dfPolitical['num_retweets'].append(retweet_count)
        dfPolitical['num_favorites'].append(favorite_count)
        dfPolitical['user_id'].append(user_id_str)
        dfPolitical['user_screen_name'].append(screen_name)
        dfPolitical['user_description'].append(user_description)
        dfPolitical['num_political_entities'].append(num_political_entities)
        dfPolitical['num_political_entities_inDescription'].append(num_political_entities_inDescription)
        dfPolitical['retweeted_political'].append(retweeted_political)
        dfPolitical['total_interactions'].append(total_num_interactions)
        dfPolitical['political_ratio'].append(political_ratio)
        dfPolitical['num_user_mention_political'].append(num_user_mention_political)
        dfPolitical['user_mention_political'].append(user_mention_political)
        dfPolitical['political_entities_men_ratio'].append(political_entities_men_ratio)
        dfPolitical['num_political_ent_used'].append(num_political_ent_used)

In [160]:
dfPolitical = pd.DataFrame(dfPolitical)
dfPolitical

Unnamed: 0,tweet_id,is_retweet,retweeted_username,text,num_mentions,num_hashtags,num_retweets,num_favorites,user_id,user_screen_name,user_description,num_political_entities,num_political_entities_inDescription,retweeted_political,total_interactions,political_ratio,num_user_mention_political,user_mention_political,political_entities_men_ratio,num_political_ent_used
0,1588568792984346624,0,,"Sosyal Hizmetin temelini çocuk oluşturur,çocuğ...",0,0,49,98,920963718103650304,maviruh_,shu/\nburaya afilli bir söz yazdığımı varsayın,0,1,False,147,0.000000,0,False,0.000000,0
1,1588452263047069697,0,,"@mahirunal Gavur İzmir ya onlar, hani Cumhuriy...",1,0,0,0,595514060,mtfdan,,1,0,False,0,0.010309,0,False,0.000000,1
2,1569589330544398336,0,,#ŞehitAdayıUzmÇvşaKadro\nSiz İstesenizde Istem...,0,1,0,0,1356375754561490947,ahsucilginuzman,Vatan Sevdalisi,1,0,False,0,0.007143,0,False,0.000000,1
3,1570428119609139201,0,,@ajans_muhbir Siz kaypak olmayıp onay vermesey...,1,0,0,0,1478775431008595968,hamitelkelle,HighOne,1,0,False,0,0.007143,0,False,0.000000,1
4,1551163840368414722,0,,Engelli öğretmenler olarak önümüzdeki engeller...,0,0,0,0,1511976696337113088,sed58417690,,2,0,False,0,0.014286,0,False,0.000000,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33528,1568595408233832448,0,,Gerçek kimlik taşımayan hesaplara cevap vermem...,0,0,9,81,576247173,ardanzenturk,RT ONAYLADIĞIM ANLAMINA GELMEZ\nArtık fikirler...,0,1,False,90,0.000000,0,False,0.000000,0
33529,1584027427696959488,0,,@umitozdag Neden Suriyelilerle ilgili bu kadar...,1,0,1,8,162308585,ozgul_61,Bridge design engineer Yaay hesabı : dilfiruz,1,0,False,9,0.007143,3,True,1.500000,1
33530,1585945783307730945,0,,@celebimehmeta Niye Türkiye yüzyılıda.Türkiye ...,1,0,0,1,415025519,ladrekova,,1,0,False,1,0.013158,0,False,0.000000,1
33531,1569748909521801221,1,muazzezeralp,RT @muazzezeralp: @Doan58213655 @denizkonur @N...,7,1,6,0,1442125177727307781,yapikytgrivrlsn,,4,0,False,6,0.028571,9,True,0.642857,6


## 1.2. From Users

### 1.2.1. Get user metadata from user_profiles.jsons.gz

#### 1.2.1.1. Get user info metadata

In [161]:
def get_user_info_metadata(user_metadata_line):
    
    user_id = user_metadata_line['id_str']
    user_name = user_metadata_line['name']
    user_screen_name = user_metadata_line['screen_name'].lower()
    user_location = user_metadata_line['location']
    user_description = user_metadata_line['description']
    user_followers_count = user_metadata_line['followers_count']
    user_friends_count = user_metadata_line['friends_count']
    user_verified = user_metadata_line['verified']
    user_default_photo = user_metadata_line['default_profile_image']

    dictionary = {'user_id':user_id, 'user_name': user_name, 'user_screen_name':user_screen_name, 'user_location':user_location,
     'user_description':user_description, 'user_followers_count':user_followers_count, 'user_friends_count':user_friends_count, 'user_verified':user_verified, 'user_default_photo':user_default_photo}

    return dictionary

#### 1.2.1.2. Get followers/(followers+friends) ratio

In [162]:
def get_followers_all_ratio(user_followers_count, user_friends_count):
    
    if user_friends_count + user_followers_count == 0:
        followers_all_ratio = 0

    else:
        followers_all_ratio =  user_followers_count / (user_friends_count + user_followers_count)

    return followers_all_ratio

#### 1.2.1.3. Get description length

In [163]:
def get_desc_len(user_description):
    
    description_len = len(user_description)

    return description_len

In [164]:
dfBot = {'user_id':[],
         'user_name':[],
         'user_screen_name':[],
         'user_location':[],
         'user_description':[],
         'user_followers_count':[],
         'user_friends_count':[],
         'description_len':[],
         'followers_to_all_ratio':[],
         'user_default_photo':[],
         'user_verified':[]}

with gzip.open(f"{PATH_TO_DOWNLOADED}user_profiles.jsons.gz", "rb") as f:
    for line in f:
        line = json.loads(line)

        dictionary = get_user_info_metadata(line)
        for k,v in dictionary.items():
            dfBot[k].append(v)

        
        # manually crafted data:
        description_len = get_desc_len(dictionary['user_description'])
        dfBot['description_len'].append(description_len)
        
        followers_all_ratio = get_followers_all_ratio(dictionary['user_followers_count'], 
                                                      dictionary['user_friends_count'])

        dfBot['followers_to_all_ratio'].append(followers_all_ratio)

In [165]:
dfBot = pd.DataFrame(dfBot)
dfBot

Unnamed: 0,user_id,user_name,user_screen_name,user_location,user_description,user_followers_count,user_friends_count,description_len,followers_to_all_ratio,user_default_photo,user_verified
0,1431241870848450577,Nasreena Khan Wazir,nasreenakhan006,"Islamabad, Pakistan",Student,65,185,7,0.260000,False,False
1,1304340303080386560,fania :((((,scorpiehoez,bogor,have a holly jolly🎄,8235,3011,19,0.732260,False,False
2,1116042038577958914,Yusuf Aksoy,yusufak63712920,,"Bir şeyden pişmanlık duymak istemiyorsan,her ş...",95,399,64,0.192308,False,False
3,4859899931,Be (VIXX6) ama oppalarının düğününe gidemiyor,nedenburdaysam,Hufflepuff ortak salon,"SMStan\n/St☆rlight ///come on girls,this is ou...",40,83,65,0.325203,False,False
4,2225373636,SLMDMR,biologselim,,BİYOLOG🔬🦠\nNanoteknoloji,100,98,23,0.505051,False,False
...,...,...,...,...,...,...,...,...,...,...,...
29665,1320834618220781569,islammm,islam_mrsj,,,229,217,0,0.513453,False,False
29666,111074128,Melda Onur,meldaonur,,"Şekersiz çay, etsiz sofra, SAVAŞSIZ dünya... 🐌...",212457,5428,100,0.975088,False,False
29667,36946875,ali ydm,ali_ydm,"İstanbul, Türkiye",hayat oyunsa bende jeton çok,116,141,28,0.451362,False,False
29668,2389587396,Türkan Usta,turkanusta,"Ankara, Türkiye",Ustaya sormuşlar; hayatta yaptığın en büyük is...,1669,3639,116,0.314431,False,False


### 1.2.2. Get Tweet Info of Users in user_profiles.jsons.gz

#### 1.2.2.1. Check ratio of retweets to all tweets

In [166]:
def get_retweet_tweet_ratio(line):
    number_retweets = 0
    number_original_tweets = 0

    for tweet in line['tweets']:
        try:
            tweet['retweeted_status']
            number_retweets += 1
                
        except:
            number_original_tweets += 1
            
    total_tweets = number_retweets + number_original_tweets
    
    if total_tweets == 0:
        retweet_total_ratio = None
    else:
        retweet_total_ratio = number_retweets/(total_tweets)
    
    return [retweet_total_ratio,number_retweets]

#### 1.2.2.2. Check median number of favorites

In [167]:
def get_median_number_favorites(line):
    num_median_favorites = np.median([tweet['favorite_count'] for tweet in line['tweets']])

    return num_median_favorites

### 1.2.3. Collect data using the functions above and transform into a Pandas DataFrame

In [168]:
dfBotTweets = {'user_id':[],
               'retweet_total_ratio':[],
               'num_median_favorites':[],
               'num_of_retweets':[]
              }

i = 0

with gzip.open(f"{PATH_TO_DOWNLOADED}user_tweets.jsons.gz", "rb") as f:
    for line in f:

        line = json.loads(line)

        user_id = line['user_id']
        dfBotTweets['user_id'].append(user_id)
        
        retweet_total_ratio = get_retweet_tweet_ratio(line)[0]
        dfBotTweets['retweet_total_ratio'].append(retweet_total_ratio)
        
        num_median_favorites = get_median_number_favorites(line)
        dfBotTweets['num_median_favorites'].append(num_median_favorites)
        
        num_of_retweets = get_retweet_tweet_ratio(line)[1]
        if(num_of_retweets is None):
          break
        else:
          dfBotTweets['num_of_retweets'].append(num_of_retweets)

        i += 1
        if i % 1000 == 0:
            print(i)

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000


In [169]:
dfBotTweets = pd.DataFrame(dfBotTweets)
dfBotTweets

Unnamed: 0,user_id,retweet_total_ratio,num_median_favorites,num_of_retweets
0,594642154,0.115000,2.0,23
1,525600289,0.005025,1.0,1
2,931895965501534209,0.900000,0.0,180
3,1591543462746329088,0.185000,0.0,37
4,734801354749796352,1.000000,0.0,200
...,...,...,...,...
28310,1591370361488252928,0.800000,0.0,160
28311,1475272459616235525,0.825000,0.0,165
28312,1096753792731750401,0.051020,1.0,10
28313,1269527617687953409,0.095000,2.0,19


### 1.2.3. Merge dfBot and dfBotTweets

In [170]:
dfBotAll = dfBot.merge(dfBotTweets,
                       how='left')

dfBotAll[['retweet_total_ratio', 'num_median_favorites']] = dfBotAll[['retweet_total_ratio', 'num_median_favorites']].fillna(0)

dfBotAll['num_of_retweets']=dfBotAll['num_of_retweets'].fillna(0)

# 2. TRAIN MODEL

## 2.1. Political Tweet Prediction

### 2.1.1. Merge dfPolitical data with labels

In [171]:
dfPoliticalAll_train = dfPolitical.merge(trainingTweetDf,
                                         on='tweet_id')

dfPoliticalAll_train

Unnamed: 0,tweet_id,is_retweet,retweeted_username,text,num_mentions,num_hashtags,num_retweets,num_favorites,user_id,user_screen_name,user_description,num_political_entities,num_political_entities_inDescription,retweeted_political,total_interactions,political_ratio,num_user_mention_political,user_mention_political,political_entities_men_ratio,num_political_ent_used,isPolitical
0,1585955683513798656,0,,@AvOzlemZengin YüzüncüYıla YakışanGenelAf adli...,1,0,3,2,1564992353168941058,zehra78231638,,3,0,False,5,0.023810,0,False,0.000000,5,Yes
1,1597631718479261696,0,,#TCYüzyılıÜcretliÖgrtKadro\n#TCYüzyılıÜcretliÖ...,0,2,30,28,1324630334416297985,nurozguler,,3,0,False,58,0.021429,0,False,0.000000,4,Yes
2,1572522789948751874,0,,Ekrem İmamoğlu davayı değerlendirdi. 'Boş işle...,0,0,5,66,407597071,onediocom,Türkiye'nin ilk ve tek sosyal içerik sitesi ht...,4,0,False,71,0.030534,0,False,0.000000,4,Yes
3,1591412481561624577,0,,Sayın Bakanım @suleymansoylu POMEM önlisans er...,1,0,0,0,1394789887073738753,buckybarnestr,...,6,0,False,0,0.042857,6,True,3.000000,7,Yes
4,1596914274907348992,0,,"@varank Sayın bakanım, Bodrumdaki bu araziyi ...",1,0,0,0,1586083256088371201,sayariahmet,,3,0,False,0,0.034483,3,True,1.500000,3,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,1565798698029924353,0,,@dbdevletbahceli Şoför,1,0,0,0,1373715344301391882,ahmetazrak8,,2,0,False,0,0.090909,3,True,1.500000,2,No
2996,1586019337248309248,0,,Türkiyenin geleceği ücretli öğretmenliğe emane...,4,0,0,0,1583586562817785860,taskinali003,,3,0,False,0,0.022222,7,True,0.875000,3,Yes
2997,1589888339108585474,0,,@enginozkoc ARTIK YETER !\nSUÇ DUYURUSUDUR\n@R...,3,0,0,0,1589554549790052353,00can02,,6,0,False,0,0.042857,13,True,2.166667,6,Yes
2998,1587113229041508352,0,,@mahirunal Parti adına doğrusunu yaptın.,1,0,0,0,1491090669263278081,teomantiryakio2,,1,0,False,0,0.025000,0,False,0.000000,1,Yes


### 2.1.2. Separate X and y values
We only use 3 features here to create a baseline model. However, it is not enough to get good results.

In [172]:
X = dfPoliticalAll_train[['num_political_entities','total_interactions', 'num_retweets', 'num_mentions','retweeted_political','political_ratio','political_entities_men_ratio','num_political_ent_used']]
y = dfPoliticalAll_train['isPolitical'].apply(lambda x: 1 if x=='Yes' else 0)

### 2.1.3. Train - validation split

In [173]:
from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.20, random_state=42)

### 2.1.4. Train the model

In [174]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression


params = {
    'max_depth': [1, 3, 6, 8],
    'objective': ['binary:logistic'],
    'min_child_weight': [3,5,7,9,11],
    'learning_rate': [0.01,0.1,0.2],
    'subsample':[0.6,0.8,1],
    'colsample_bytree': [0.3,0.7,0.9],
    'gamma': [0,0.1,0.3,0.5]
}

xgb_model = xgb.XGBRegressor(seed=20)

grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=5, verbose=True)

grid_search.fit(X_train, y_train)

# Print the best parameters and the best score
print(grid_search.best_params_)
print(grid_search.best_score_)

# Make predictions on the test set
preds = grid_search.predict(X_valid)

mse_political = mean_squared_error(y_valid, preds)

print("MSE:", mse_political)

importances = grid_search.best_estimator_.get_booster().get_score(importance_type='weight')
print(importances)


Fitting 5 folds for each of 2160 candidates, totalling 10800 fits
{'colsample_bytree': 0.9, 'gamma': 0.5, 'learning_rate': 0.1, 'max_depth': 3, 'min_child_weight': 7, 'objective': 'binary:logistic', 'subsample': 1}
0.3310105864817057
MSE: 0.15565370352885136
{'political_ratio': 157, 'num_mentions': 70, 'num_retweets': 82, 'total_interactions': 112, 'num_political_ent_used': 38, 'num_political_entities': 34, 'political_entities_men_ratio': 84}


## 2.2. Bot Detection

### 2.2.1. Merge dfBotAll data with labels

In [175]:
dfBotAll.user_screen_name = dfBotAll.user_screen_name.str.lower()

In [176]:
dfBotAll_train = dfBotAll.merge(trainingUserDf,
                               left_on='user_screen_name',
                               right_on='screen_name')

dfBotAll_train

Unnamed: 0,user_id,user_name,user_screen_name,user_location,user_description,user_followers_count,user_friends_count,description_len,followers_to_all_ratio,user_default_photo,user_verified,retweet_total_ratio,num_median_favorites,num_of_retweets,screen_name,isBot
0,1512081815292432394,sezgin,sezgin953116371,,,46,430,0,0.096639,True,False,0.050251,0.0,10.0,sezgin953116371,No
1,1425452291428077571,Adem Koç,gogoadem61,,,14,171,0,0.075676,False,False,0.761062,0.0,86.0,gogoadem61,No
2,328164303,Necmettin Balıkçı,dewil511,,,21,49,0,0.300000,False,False,0.010101,0.0,2.0,dewil511,Yes
3,1343666971368431622,Night Bird⁷🦉,midnight__bird,,"La vie est un sommeil, l’amour en est le rêve...",422,260,48,0.618768,False,False,0.085000,1.0,17.0,midnight__bird,No
4,1240932880488038400,Samed Pınarcı,samedpinarci,,Orman Mühendisi - Orman İşletme Şefi - Orman G...,133,202,60,0.397015,False,False,0.780000,0.0,156.0,samedpinarci,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,1286770207134973954,Hamide Arabacı,anka6054,,,151,61,0,0.712264,False,False,0.000000,1.0,0.0,anka6054,No
2996,1598032338323214338,atamabekleyenbahceci,atamabekleyenzz,,,173,367,0,0.320370,False,False,0.580000,0.0,116.0,atamabekleyenzz,No
2997,760235343966863360,Emrah İNCİ,memrahinci,Istanbul - Bayburt,Researcher | Middle East | Political Science |...,5863,5905,71,0.498215,False,False,0.040000,36.0,8.0,memrahinci,No
2998,1553973684100124672,Murat Kkk,muratkkk18,,Normal sıradan bir insanım,1,10,26,0.090909,False,False,0.769231,0.0,10.0,muratkkk18,No


In [177]:
trainingUserDf.isBot.value_counts()

No     2424
Yes     576
Name: isBot, dtype: int64

### 2.2.2. Separate X and y values

In [178]:
X = dfBotAll_train[['description_len', 'followers_to_all_ratio', 'retweet_total_ratio', 'num_median_favorites','user_friends_count','user_followers_count','num_of_retweets']]
y = dfBotAll_train.isBot.apply(lambda x: 1 if x=='Yes' else 0)

### 2.2.3. Train-test split

In [179]:
from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.20, random_state=42)

### 2.2.4. Train the model

In [180]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error


params = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.1, 0.2],
    'objective': ['binary:logistic'],
    'gamma': [0,0.1,0.3,0.5],
    'min_child_weight': [3,5,7],
    'colsample_bytree': [0.6,0.8,1]
}

xgb_model = xgb.XGBRegressor()

grid_search_bot = GridSearchCV(estimator=xgb_model, param_grid=params, cv=5,verbose=True)

grid_search_bot.fit(X_train, y_train)

# Print the best parameters and the best score
print(grid_search_bot.best_params_)
print(grid_search_bot.best_score_)

# Make predictions on the test set
preds = grid_search_bot.predict(X_valid)

mse_bot = mean_squared_error(y_valid, preds)

print("MSE:", mse_bot)

importances = grid_search_bot.best_estimator_.get_booster().get_score(importance_type='weight')
print(importances)

Fitting 5 folds for each of 216 candidates, totalling 1080 fits
{'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'min_child_weight': 3, 'objective': 'binary:logistic'}
0.1190720517914066
MSE: 0.12287608098144233
{'num_median_favorites': 36, 'description_len': 65, 'followers_to_all_ratio': 142, 'user_friends_count': 120, 'num_of_retweets': 79, 'retweet_total_ratio': 48, 'user_followers_count': 116}


# 3. MAKE PREDICTIONS

## 3.1. Predictions for Tweets (Political or Not)

In [181]:
# read the evaluation file as follows
filename = 'evaluation-round{}-tweet.csv'.format(ROUND)
filepath = DATA_PATH + filename

evaluationTweetDf = pd.read_csv(filepath, dtype={0: str}, header=None, names=['tweet_id'])
evaluationTweetDf = evaluationTweetDf.dropna()
evaluationTweetDf

# merge it with the political dataframe so that you can use the make predictions based on the variables
dfPolitical_test = dfPolitical.merge(evaluationTweetDf)

# define X as we did above in section (2.x.2. Separate X and y values)
X = dfPolitical_test[['num_political_entities','total_interactions', 'num_retweets', 'num_mentions','retweeted_political','political_ratio','political_entities_men_ratio','num_political_ent_used']]

# make predictions based on these variables
predictions_political = grid_search.predict(X)

In [184]:
modelPredTweet = dict([(x,float(y)) for x,y in zip([*dfPolitical_test.tweet_id], predictions_political)])
modelPredTweet

{'1434787703783051264': 0.5054644346237183,
 '1367571642604544000': 0.41860708594322205,
 '1589993032975544320': 0.9516534209251404,
 '1565312596135354373': 0.6925728917121887,
 '1579558096833511424': 0.9203459024429321,
 '1439547067337256967': 0.32016319036483765,
 '1559963768372740098': 0.8117018342018127,
 '1562853131251118081': 0.5685722231864929,
 '1586021183958704128': 0.883188009262085,
 '1585766233491886081': 0.7936273217201233,
 '1427746815420604417': 0.4975329637527466,
 '1352635736537882629': 0.4959832429885864,
 '1415032260571680768': 0.5054644346237183,
 '1548636597628899328': 0.9325820207595825,
 '1564926450096013313': 0.7361527681350708,
 '1585634359612420101': 0.9299928545951843,
 '1597138789108895744': 0.8680359125137329,
 '1391681495622995971': 0.1275561898946762,
 '1389951943343316995': 0.31376713514328003,
 '1452348722810138646': 0.7959324717521667,
 '1595829502021623812': 0.7759685516357422,
 '1413108476348354562': 0.31376713514328003,
 '1579408398894137344': 0.817

## 3.2. Predictions for Users (Bot or Not)

In [185]:
filename = 'evaluation-round{}-user.csv'.format(ROUND)
filepath = DATA_PATH + filename
evaluationUserDf = pd.read_csv(filepath, dtype={0: str}, header=None, names=['user_screen_name'])

evaluationUserDf = evaluationUserDf.dropna()

# merge it with the political dataframe so that you can use the make predictions based on the variables
dfBot_test = dfBotAll.merge(evaluationUserDf)

# define X as we did above in section (2.x.2. Separate X and y values)
X = dfBot_test[['description_len', 'followers_to_all_ratio', 'retweet_total_ratio', 'num_median_favorites','user_friends_count','user_followers_count','num_of_retweets']]

# make predictions based on these variables
predictions_bot = grid_search_bot.predict(X)

In [187]:
modelPredUser = dict([(x,float(y)) for x,y in zip([*dfBot_test.user_screen_name], predictions_bot)])
modelPredUser

{'biologselim': 0.5139498710632324,
 'omerakdag34': 0.021547073498368263,
 'bilgin21604923': 0.20678631961345673,
 '_sydneycarton_': 0.12257526814937592,
 'denizlihabercom': 0.043855905532836914,
 'burakerbaychp': 0.018331466242671013,
 'mvnez': 0.18620462715625763,
 'qara118': 0.02080150879919529,
 'nabiyonyevrum': 0.06787581741809845,
 'farukhalit2': 0.04003332927823067,
 'harlunoshi': 0.09133040904998779,
 'heritagepaix': 0.24103787541389465,
 'nuranwolf': 0.013011902570724487,
 'politikgundem': 0.1426081657409668,
 'isakethudax': 0.01376291923224926,
 'enveraysevera': 0.020747967064380646,
 'ilaydejaneiro': 0.07683258503675461,
 '1905anason': 0.32619816064834595,
 'eraydurgut03': 0.07330784946680069,
 'dasiskein': 0.07015775889158249,
 'ercan_bas29': 0.08166806399822235,
 'mett_1907': 0.06921795755624771,
 'ondemir066': 0.32535311579704285,
 'semihyeteer': 0.09926706552505493,
 'haberinyokcokk': 0.2368965744972229,
 'meleky_ozaydin': 0.2628859281539917,
 'mehmetaltay64': 0.02268890

# Explanation of the Approach

In [188]:
# The approach

data_explanations = '''
How I handled data for training:

Firstly, I've extended the list of entities for the political entities even more than I did in the previous 2 rounds. 
I've added columns to the dfPolitical dataframe. Retweeted_political, num_political_entities_inDescription, political_ratio, num_user_mention_political, user_mention_political, political_entities_men_ratio, num_political_ent_used.
For the user I've gathered the user_verified data along with user_default_photo data. I've extracted these from user_metadata_line.

'''

feature_explanations = '''
What types of features I created:

For the political dataframe: 
retweeted_political checks if the user has retweeted the tweet of a politician or not.
num_political_entities_inDescription checks how many political entities is in the user's description. (if they have a description)
political_ratio checks the ratio between the political entities in a tweet and the tweet's total length.
num_user_mention_political checks how many political entities did the user "mention".
user_mention_political checks if the user mentioned any political entities or not.
political_entities_men_ratio checks the ratio between all the user's mentions and the political mentions.
num_political_ent_used checks how many political entities were used in the tweet. It's different from the one in the notebook since the one in the notebook only counts the distinct words. This function counts how many regardless of the political entities being distinct.

For the bot dataframe:
user_verified checks if the user is verified or not.
user_default_photo checks if the user has a default photo or not.
'''

model_explanations = '''
What did I try and used for modeling:

I've tried several models. Firstly what I did was trying out all the suggested models which were XGBoost, AdaBoost, RandomForest, Linear Regression and Logistic Regression. I've tried these models on the political tweet part.
The best models were xgboost and linear regression. XGBoost was around 0.19 where linear regression was around 0.2. I've went with XGBoost. I've modified its parameters, did gridsearchcv() on it, checked feature importances then dropped the columns which had less impact. I was able to drop the MSE down to 0.155 in the last round.
I also tried combining Linear Regression with XGBoost but it did not give me a better MSE score so I did not use it. I've used the same model for bot detection.
'''

