# 0. Initialize

## 0.1. Import Libraries

In [None]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
import os, sys, glob
import gzip
import random
import tqdm
import json
import numpy as np
import pandas as pd
import datetime
pd.set_option("display.max_columns", None)

from IPython import display
import matplotlib as mpl
from matplotlib import pyplot as plt

In [None]:
# Mount Google Drive so the data files are readable under /content/drive
# (only works inside Google Colab).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## 0.2. DEFINE VARIABLES

In [None]:
# Project-wide settings. DATA_PATH is concatenated directly with file names
# (e.g. '{}training-tweet.csv'.format(DATA_PATH)), so it must end with '/'.
DATA_PATH = '/content/drive/My Drive/' # '<insert-your-training-data-path-here>'

ROUND = 3 # This project will have 3 rounds of predictions: 1,2,3
STUDENT_ID = '28301' #'<insert-your-id-here>'
PROJECT_CODE = 'CS41223dfaf48ea2d'#'<insert-your-code-here>' # Same code for the annotation eg. CS412xxxxx

## 0.3. Read Training & Evaluation Data

### 0.3.1. Get the labels for tweets

In [None]:
#trainingTweetDf = pd.read_csv('{}training-tweet.csv'.format(DATA_PATH))
# Read both columns as strings so that tweet_id can later be merged with the
# string 'id_str' values collected from the tweet metadata.
trainingTweetDf = pd.read_csv('{}training-tweet.csv'.format(DATA_PATH), dtype={'tweet_id': str, 'isPolitical': str})

trainingTweetDf

Unnamed: 0,tweet_id,isPolitical
0,1.58638E+18,No
1,1.58806E+18,Yes
2,1345142602794938372,No
3,1345305899636817920,No
4,1345415743417053190,No
...,...,...
3563,1598047568185167872,Yes
3564,1598049160938328089,Yes
3565,1598052366070484993,Yes
3566,1598057560292364288,Yes


In [None]:
# Class balance of the tweet labels.
trainingTweetDf.isPolitical.value_counts()

Yes    2363
No     1203
Name: isPolitical, dtype: int64

### 0.3.2. Get the labels for users

In [None]:
# Load the user-level labels (columns: screen_name, isBot).
trainingUserDf = pd.read_csv('{}training-user.csv'.format(DATA_PATH))
#trainingUserDf = pd.read_csv('training-user.csv')
trainingUserDf

Unnamed: 0,screen_name,isBot
0,343734,Yes
1,41214727,No
2,73820407,Yes
3,_______duygu,Yes
4,_______iarzu__,No
...,...,...
3540,zumzumraa,No
3541,zvz12seco,Yes
3542,zwpert,No
3543,zzehra_gunes,No


In [None]:
# Class balance of the bot labels; besides Yes/No there is a "Not sure" class.
trainingUserDf.isBot.value_counts()

No          2821
Yes          676
Not sure      47
Name: isBot, dtype: int64

### 0.3.3. Expand your dataset with metadata and tweets

In [None]:
# The training data can be expanded with your own labeled datasets:
# open the report page printed below and download the documents listed
# under "Link to training data".

print(f'http://www.onurvarol.com/Annotation-CS412-202201/reports/report_{PROJECT_CODE}.html')

http://www.onurvarol.com/Annotation-CS412-202201/reports/report_CS41223dfaf48ea2d.html


# 1. EXTRACT FEATURES
Under *1.1. Political Tweet Detection* and *1.2. Bot Detection*, we first collect raw data for processing. We then combine some of the raw values (e.g. total_interactions = num_favorites + num_retweets) or use them to extract features (e.g. whether the tweet mentions one of the political entities such as @meralaksener or @kilicdarogluk).

We expect you to collect more raw data from the **tweet_metadata**, **user_profiles** and **user_tweets** files by writing functions like the examples below (e.g. *check_if_retweet()*) and calling them while iterating over the data, as shown under *Merge Collected Features*.

We also expect you to create as many new variables from the data as you can in order to make your predictions more accurate. For example, you may want to check:

- The tweet sources that a user frequently uses
- Whether the user is a verified account or not

...

to assess whether **a user is a bot or not** and whether **a tweet is political or not**.

In [None]:
# Directory prefix of the downloaded *.jsons.gz files; reuses DATA_PATH.
PATH_TO_DOWNLOADED = DATA_PATH # 'D:/Users/suuser/Desktop/Sabancı/CS412/spring-2022/project/'

## 1.1. Political Tweet Detection
This part stands for the feature extraction of tweets. We start with collecting the raw data from *tweet_metadata*, then use some of them to extract features.

### 1.1.1. Get Raw Data

#### 1.1.1.1. Check if Retweet

In [None]:
def check_if_retweet(tweet_metadata_line):
    """Tell whether a tweet is a retweet and who was retweeted.

    Returns a pair ``(is_retweet, retweeted_username)``: ``(1, name)`` with the
    lower-cased screen name of the original author when the
    'retweeted_status' -> 'user' -> 'screen_name' path exists, otherwise
    ``(0, None)``.
    """
    try:
        original_author = tweet_metadata_line['retweeted_status']['user']['screen_name'].lower()
    except KeyError:
        # Any missing key along the path means "not a retweet".
        return 0, None

    return 1, original_author

#### 1.1.1.2. Get Tweet Text

In [None]:
def get_tweet_text(tweet_metadata_line):
    """Return the value stored under the 'text' key of a tweet record."""
    return tweet_metadata_line['text']

#### 1.1.1.3. Get Tweet ID

In [None]:
def get_tweet_id(tweet_metadata_line):
    """Return the value stored under the 'id_str' key of a tweet record."""
    return tweet_metadata_line['id_str']

#### 1.1.1.4. Get Number of Mentions and Hashtags

In [None]:
def get_number_mentions_hashtags(tweet_metadata_line):
    """Return ``(num_mentions, num_hashtags)`` counted from the tweet's 'entities'."""
    entities = tweet_metadata_line['entities']

    return len(entities['user_mentions']), len(entities['hashtags'])

#### 1.1.1.5. Get Number of Retweets and Favorites

In [None]:
def get_number_retweets_favorites(tweet_metadata_line):
    """Return ``(retweet_count, favorite_count)`` as stored in the tweet record."""
    return (
        tweet_metadata_line['retweet_count'],
        tweet_metadata_line['favorite_count'],
    )

#### 1.1.1.6. Get User Info

In [None]:
def get_user_info(tweet_metadata_line):
    """Return the tweet author's ``(id_str, screen_name, description)``.

    The screen name is lower-cased; the other two values are returned as found
    under the record's 'user' entry.
    """
    author = tweet_metadata_line['user']

    return author['id_str'], author['screen_name'].lower(), author['description']

### 1.1.2. Derive Manually Crafted Features

#### 1.1.2.1. Check for political entity in text

In [None]:
def check_political_ent(text):
    """Count how many entries of the political keyword list occur in ``text``.

    Matching is a case-insensitive substring test (both sides are lower-cased),
    so short entries such as "rte" also match inside longer words.

    Args:
        text: the tweet text to scan.

    Returns:
        The number of list entries found in ``text``. Entries that appear more
        than once in the list (e.g. 'erdoğan') are counted once per occurrence
        in the list, so repeated keywords weigh more.
    """

    # the list below can be modified and some new names may be added (or removed)
    # NOTE: every element must be followed by a comma; adjacent string literals
    # without a comma are silently concatenated into one (never-matching) entry.
    list_of_entities = [
        'meral_aksener','Akşener','Kılıçdaroğlu','Muharrem İnce','Mevlüt Çavuşoğlu',
        'kilicdarogluk', 'vekilince', 'RTErdogan', 'MevlutCavusoglu', 'umitozdag',
        "akp", "chp", "mhp", "hdp", "iyip", "iyi parti", "ak parti", "rte", "reis", "pkk",
        "altılı masa", "deva partisi", "saadet", "davutoğlu", "babacan", "erdoğan",
        "kılıçdar", "imamoğlu", "mansur yavaş", "akşener", "demirtaş", "suriyeliler",
        "seçim", "süleyman soylu", "bahçeli",
        'fetö','hain','ekonomi','dolar','euro','bakan', "işsizlik","atama", "terör",'@suleymansoylu',
        '@vedatbilgn',
        'Devlet Bahçeli',
        'Ak parti',
        '@tcbestepe',

        '@06melihgokcek', 'Melih Gökçek',
        '@akaraismailoglu',
        '@fuatoktay', 'Fuat Oktay',
        '@bybekirbozdag', 'Bekir Bozdağ',
        '@Mustafa_Destici', 'Mustafa Destici',
        '@celebimehmeta',

        '@alimahir',
        '@NureddinNebati', 'Nureddin Nebati',
        '@herkesicinCHP',
        '@eczozgurozel', 'Özgür Özel',
        '@enginozkoc', 'Engin Özkoç',
        '@TC_icisleri', 'İçişleri Bakanlığı',
        '@NumanKurtulmus', 'Numan Kurtulmuş',
        '@dfikrisaglar', 'Fikri Sağlar',
        '@fahrettinaltun', 'Fahrettin Altun',
        '@DpGultekinUysal', 'Gültekin Uysal',
        '@iyiparti', 'İyi Parti',
        '@Ahmet_Davutoglu', 'Ahmet Davutoğlu',
        '@alibabacan', 'Ali Babacan',
        '@DIBAliErbas',  'Ali Erbaş',
        '@omerrcelik', 'Ömer Çelik',
        '@vekilince',
        '@mahirunal', 'Mahir Ünal',
        '@akbasogluemin',
        '@murat_kurum', 'Murat Kurum',
        '@halktvcomtr', 'Halk Tv',
        '@saglikbakanligi', 'Sağlık Bakanlığı',
        '@veliagbaba', 'Veli Ağbaba',
        '@EmineErdogan', 'Emine Erdoğan',
        '@MHP_Bilgi',
        '@suleymansoylu:', 'Süleyman Soylu',
        '@avabdullahguler', 'Abdullah Güler',
        '@jsarieroglu',
        '@samiltayyar27',
        '@faikoztrak',
        '@ikalin1',
        '@cenginyurt52',
        '@bakisimsekmhp',
        '@sgirgin48tbmm',
        '@yavuzagiraliog',
        '@HDPgenelmerkezi',
        '@hdpdemirtas', 'Selahattin Demirtaş',
        '@emrullahisler',
        '@zaferpartisi', 'Zafer Partisi',
        '@OlcayKilavuz',
        '@adalet_bakanlik', 'Adalet Bakanlığı',
        '@tanjuozcanchp', 'Tanju Özcan',
        '@mvhuseyinyildiz',
        'Hüseyin Yıldız',
        '@tcbestepe:',
        '@TBMMresmi',
        '@MustafaSentop',
        'Mustafa Şentop',
        '@csbgovtr',
        '@bbpgenelmerkez',
        '@Canan_Kaftanci', 'Canan Kaftancıoğlu',
        '@serkantopalchp',
        '@erenerdemnet', 'Eren Erdem',
        '@aefakibaba',

        '@barisyarkadas',
        '@gergerliogluof',
        '@TC_icisleri…',
        '@ikoncuk',
        '@MhpTbmmGrubu',
        '@RumeysaKadak', 'Rümeysa Kadak',
        '@MzyenSEvkin',
        '@RTEdijital',
        '@kilicdarogluk…',
        '@meral_aksener…',
        '@BBahadirErdem',
        '@yeniakit',
        '@SezaiTemelli',
        '@rterdogan',
        '@kilicdarogluk',
        '@meral',
        'aksener',
        '@suleymansoylu',
        '@drfahrettinkoca',
        '@umitozdag',
        '@ekrem_imamoglu',
        'imamoglu',
        'bakanım',
        '@akparti',
        'başkanım',
        'vekilim',

        '@akaraismailoglu',
        '@dbdevletbahceli',
        '@nureddinnebati',
        'erdoğan',
        'kemal',
        '@alimahir',

        '@tcaytunciray',
        'kpss',
        '@eczozgurozel',
        '@enginozkoc',
        'cumhurbaşkanım',
        '@alpayozalan35',
        '@tcmeb',
        'destici',
        '@rterdogan…',
        '@varank',
        'adalet',
        'bakan',
        'başkan',
        '@avozlemzengin',
        '@herkesicinchp',
        'cumhurbaşkanı',
        'i̇mamoğlu',
        '@faikoztrak',

        'cumhurbaşkanımız',
        'icisleri',
        'kamu',
        'parti',
        '@arzuerdemdb',
        '@halktvcomtr',
        '@osmannnurika',
        '@alibabacan',

        '@lutfuturkkan',
        'ekrem',
        '@dfikrisaglar',
        '@fahrettinaltun',
        '@numankurtulmus',
        'tayyip',
        '@dibalierbas',
        '@dpgultekinuysal',
        'cumhuriyetimizin',
        'davutoglu',
        '@mansuryavas06',
        '@omerrcelik',
        '@akbasogluemin',
        '@ayyildirim1',
        '@fethigurer',
        '@iyiparti',
        '@vahitkirisci',
        'kılıçdaroğlu',
        'seçim',
        '@deryayanikashb',
        '@veliagbaba',
        'bahçeli',
        'mansur',
        'putin',
        '@mahirunal',
        '@oznurcalik',
        '@tanjuozcanchp',
        '@vekilince',
        '@yavuzagiraliog',
        'başkanı',
        'istihdam',
        'muhalefet',
        'soylu',
        'vekil',
        '@csgbakanligi',
        '@emineerdogan',
        '@emrullahisler',
        '@mhp',
        'terör',
        '@bakisimsekmhp',
        '@jsarieroglu',
        '@nacicinisli',
        '@olcaykilavuz',
        '@samiltayyar27',
        '@cenginyurt52',
        '@hdpgenelmerkezi',
        '@seferaycan',
        'erbakan',
        'meral',
        'seçimde',
        'şehit',
        '@avabdullahguler',
        '@belginuygur10',
        '@cumhuriyetgzt',
        '@drfahrettinkoca…',
        '@kilicdarogluk…',
        '@saglikbakanligi',
        'bakanı',
        'devletimize',
        'devletin',
        'erdoğanaffı',
        'fetö',
        'ihale',
        'nato',
        'reis',
        'tarım',
        'ulaştırma',
        '@erenerdemnet',
        '@erkanakcay45',
        '@gergerliogluof',
        '@mehmedmus',
        '@mhptbmmgrubu',
        '@mstanrikulu',
        '@rtedijital',
        '@sgirgin48tbmm',
        '@suleymansoylu…',
        '@tbmmresmi',
        '@yildizfeti',
        'anayasa',
        'bakanlığı',
        'cevdetyilmaz',
        'devlete',
        'icisleri…',
        'tayyi̇p',
        'ukrayna',
        '@ayhanbilgen',
        '@birgun',
        '@hasandogan',
        '@hdpdemirtas',
        '@ikalin1',
        '@muratemirchp',
        '@oguzksalici',
        '@serkantopalchp',
        '@sezaitemelli',
        '@suayipbirinci',
        '@turkiskonf',
        '@ulviyonter',
        '@zaferpartisi',
        'adayları',
        'basın açıklaması',
        'reisim',
        'özdağ',
        '@adalet',
        '@avhamzadag',
        '@avidrissahin',
        '@bulenttufenkci',
        '@cebrailyakr',
        '@dpgultekinuysal…',
        '@ednanarslanchp',
        '@enginpnrbsli',
        '@erginkahveci06',
        '@fatmasahin',
        '@gazetesozcu',
        '@irfankaplanchp',
        '@mevlutcavusoglu',
        '@mustafasentop',
        '@mvhuseyinyildiz',
        '@mzyensevkin',
        '@orhanssumer',
        '@senolsunat',
        '@tcsavunma',
        '@tctarim',
        '@vedatbilgn…',
        '@yeniakit',
        '@yenisafak',
        '@ziyaselcuk',
        'bakanlık',
        'demokrasi',
        'dolar',
        'ekonomi',
        'enflasyon',
        'gençlere',
        'hdpkk',
        'iktidar',
        'kılıçdaroğlunesöyledi',
        'nebati',
        'receptayyiperdogan',
        'reisi',
        'rei̇s',
        'rusya',
        'terörist',
        'türkiyeyüzyılı',
        'örgüt',
        '@aefakibaba',
        '@ahaber',
        '@akgencmalatya',
        '@akkadin',
        '@akpartimalatya',
        '@ankarakulisi06',
        '@atuncayozkan',
        '@avcahitozkan',
        '@avserkanbayram',
        '@bbahadirerdem',
        '@bocekmuhittin',
        '@bybekirbozdag…',
        '@dbdevletbahceli…',
        '@devapartisi',
        '@drrecepakdag',
        '@emniyetgm',
        '@gurseltekin34',
        '@hakiskonf',
        '@mtanal',
        '@myeneroglu',
        '@nevsinmengu',
        '@tbmmgenelkurulu…',
        '@tcailesosyal'
    ]

    # Lower-case the tweet once instead of once per keyword.
    lowered_text = text.lower()
    entities_in_text = [ent for ent in list_of_entities if ent.lower() in lowered_text]
    entities_exist = len(entities_in_text)

    return entities_exist

#### 1.1.2.2. Number of total interactions

In [None]:
def total_interactions(retweet_count, favorite_count):
    """Return the sum of a tweet's retweet and favorite counts."""
    return retweet_count + favorite_count

#### 1.1.2.3. Ratio of total interactions

In [None]:
def ratio_interactions(retweet_count, favorite_count):
    """Return favorites per retweet; 1 is added to the retweet count so the
    denominator is never zero for non-negative counts."""
    smoothed_retweets = retweet_count + 1

    return favorite_count / smoothed_retweets

### 1.1.3. Collect data using the functions above and transform into a Pandas DataFrame

In [None]:
# Column-oriented accumulator: one list per output column, filled row by row.
dfPolitical = {column: [] for column in (
    'tweet_id',
    'is_retweet',
    'retweeted_username',
    'text',
    'num_mentions',
    'num_hashtags',
    'num_retweets',
    'num_favorites',
    'user_id',
    'user_screen_name',
    'user_description',
    'political_entity_exist',
    'total_interactions',
    'ratio_interactions',
)}


with gzip.open(f"{PATH_TO_DOWNLOADED}tweet_metadata.jsons.gz", "rb") as f:
    # One JSON document (one tweet) per line.
    for line in f:
        line = json.loads(line)

        # raw data:
        id_str = get_tweet_id(line)
        is_retweet, retweeted_username = check_if_retweet(line)
        text = get_tweet_text(line)
        num_mentions, num_hashtags = get_number_mentions_hashtags(line)
        retweet_count, favorite_count = get_number_retweets_favorites(line)
        user_id_str, screen_name, user_description = get_user_info(line)

        # manually crafted data:
        num_political_entities = check_political_ent(text)
        total_num_interactions = total_interactions(retweet_count, favorite_count)
        ratio_num_interactions = ratio_interactions(retweet_count, favorite_count)

        # Append every value of this tweet to its column.
        tweet_row = {
            'tweet_id': id_str,
            'is_retweet': is_retweet,
            'retweeted_username': retweeted_username,
            'text': text,
            'num_mentions': num_mentions,
            'num_hashtags': num_hashtags,
            'num_retweets': retweet_count,
            'num_favorites': favorite_count,
            'user_id': user_id_str,
            'user_screen_name': screen_name,
            'user_description': user_description,
            'political_entity_exist': num_political_entities,
            'ratio_interactions': ratio_num_interactions,
            'total_interactions': total_num_interactions,
        }
        for column_name, column_value in tweet_row.items():
            dfPolitical[column_name].append(column_value)


In [None]:
# Turn the dict of column lists into a DataFrame (one row per tweet).
dfPolitical = pd.DataFrame(dfPolitical)
dfPolitical

Unnamed: 0,tweet_id,is_retweet,retweeted_username,text,num_mentions,num_hashtags,num_retweets,num_favorites,user_id,user_screen_name,user_description,political_entity_exist,total_interactions,ratio_interactions
0,1588568792984346624,0,,"Sosyal Hizmetin temelini çocuk oluşturur,çocuğ...",0,0,49,98,920963718103650304,maviruh_,shu/\nburaya afilli bir söz yazdığımı varsayın,0,147,1.96
1,1588452263047069697,0,,"@mahirunal Gavur İzmir ya onlar, hani Cumhuriy...",1,0,0,0,595514060,mtfdan,,2,0,0.00
2,1569589330544398336,0,,#ŞehitAdayıUzmÇvşaKadro\nSiz İstesenizde Istem...,0,1,0,0,1356375754561490947,ahsucilginuzman,Vatan Sevdalisi,1,0,0.00
3,1570428119609139201,0,,@ajans_muhbir Siz kaypak olmayıp onay vermesey...,1,0,0,0,1478775431008595968,hamitelkelle,HighOne,1,0,0.00
4,1551163840368414722,0,,Engelli öğretmenler olarak önümüzdeki engeller...,0,0,0,0,1511976696337113088,sed58417690,,0,0,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33528,1568595408233832448,0,,Gerçek kimlik taşımayan hesaplara cevap vermem...,0,0,9,81,576247173,ardanzenturk,RT ONAYLADIĞIM ANLAMINA GELMEZ\nArtık fikirler...,0,90,8.10
33529,1584027427696959488,0,,@umitozdag Neden Suriyelilerle ilgili bu kadar...,1,0,1,8,162308585,ozgul_61,Bridge design engineer Yaay hesabı : dilfiruz,3,9,4.00
33530,1585945783307730945,0,,@celebimehmeta Niye Türkiye yüzyılıda.Türkiye ...,1,0,0,1,415025519,ladrekova,,1,1,1.00
33531,1569748909521801221,1,muazzezeralp,RT @muazzezeralp: @Doan58213655 @denizkonur @N...,7,1,6,0,1442125177727307781,yapikytgrivrlsn,,5,6,0.00


## 1.2. From Users

### 1.2.1. Get user metadata from user_profiles.jsons.gz

#### 1.2.1.1. Get user info metadata

In [None]:
def get_user_info_metadata(user_metadata_line):
    """Extract the basic profile fields of a user record.

    Returns a dict with the keys user_id, user_name, user_screen_name
    (lower-cased), user_location, user_description, user_followers_count and
    user_friends_count.
    """
    return {
        'user_id': user_metadata_line['id_str'],
        'user_name': user_metadata_line['name'],
        'user_screen_name': user_metadata_line['screen_name'].lower(),
        'user_location': user_metadata_line['location'],
        'user_description': user_metadata_line['description'],
        'user_followers_count': user_metadata_line['followers_count'],
        'user_friends_count': user_metadata_line['friends_count'],
    }

Creation Time

In [None]:
def get_user_time_metadata(user_metadata_line):
    """Return the account creation date as a ``datetime.datetime``.

    'created_at' is split on whitespace (e.g. 'Fri Aug 27 13:07:30 +0000 2021');
    the second token is the month abbreviation, the third the day and the last
    the year. Time of day and UTC offset are dropped.
    """
    month_numbers = dict(Jan=1, Feb=2, Mar=3, Apr=4, May=5, Jun=6,
                         Jul=7, Aug=8, Sep=9, Oct=10, Nov=11, Dec=12)

    tokens = user_metadata_line['created_at'].split()
    year_token, month_token, day_token = tokens[-1], tokens[1], tokens[2]

    return datetime.datetime(int(year_token), month_numbers[month_token], int(day_token))

Favourites count

In [None]:
def get_favourites_count(user_metadata_line):
    """Return the value stored under the 'favourites_count' key of a user record."""
    return user_metadata_line['favourites_count']

Tweet number

In [None]:
def get_user_tnumber_metadata(user_metadata_line):
    """Return the value stored under the 'statuses_count' key of a user record."""
    statuses_count = user_metadata_line['statuses_count']

    return statuses_count

Tweet number ratio with time

In [None]:
def get_user_tnumber_time_ratio(user_created_at, tweet_number):
    """Return the average number of tweets per day since account creation.

    The account age is the number of whole days between ``user_created_at``
    and the fixed reference date 2022-12-31.

    Args:
        user_created_at: ``datetime.datetime`` of the account creation.
        tweet_number: total number of tweets of the account.

    Returns:
        ``tweet_number / age_in_days`` rounded to 3 decimals, or 0 when the
        age is zero days (previously a ZeroDivisionError).
    """
    since = datetime.datetime(2022, 12, 31) - user_created_at

    # An account created on the reference date has no elapsed days to divide by.
    if since.days == 0:
        return 0

    tnumber_ratio = tweet_number / since.days
    return round(tnumber_ratio, 3)

#### 1.2.1.2. Get followers/(followers+friends) ratio

In [None]:
def get_followers_all_ratio(user_followers_count, user_friends_count):
    """Return followers / (friends + followers), rounded to 3 decimals.

    Returns 0 when the account has neither friends nor followers.
    """
    connections = user_friends_count + user_followers_count

    # Guard: nothing to divide by.
    if connections == 0:
        return 0

    return round(user_followers_count / connections, 3)

Follower time ratio

In [None]:
def get_timeof_follows(user_created_at, user_followers_count):
  """Return the average number of followers gained per day since creation.

  The account age is the number of whole days between ``user_created_at``
  and the fixed reference date 2022-12-31.

  Args:
      user_created_at: ``datetime.datetime`` of the account creation.
      user_followers_count: current number of followers.

  Returns:
      ``user_followers_count / age_in_days`` rounded to 3 decimals, or 0 when
      the age is zero days (previously a ZeroDivisionError).
  """
  since = datetime.datetime(2022, 12, 31) - user_created_at

  # An account created on the reference date has no elapsed days to divide by.
  if since.days == 0:
    return 0

  time_ratio = user_followers_count / since.days
  return round(time_ratio, 3)

Likes per day since account creation

In [None]:
def get_likes_time_ratio(user_created_at,favourites_count):
  """Return the average number of likes given per day since account creation.

  The account age is the number of whole days between ``user_created_at``
  and the fixed reference date 2022-12-31.

  Args:
      user_created_at: ``datetime.datetime`` of the account creation.
      favourites_count: total number of tweets the account has liked.

  Returns:
      ``favourites_count / age_in_days`` rounded to 3 decimals, or 0 when the
      age is zero days (previously a ZeroDivisionError).
  """
  since = datetime.datetime(2022, 12, 31) - user_created_at

  # An account created on the reference date has no elapsed days to divide by.
  if since.days == 0:
    return 0

  time_ratio = favourites_count / since.days
  return round(time_ratio, 3)

#### 1.2.1.3. Get description length

In [None]:
def get_desc_len(user_description):
    """Return the number of characters in the profile description."""
    return len(user_description)

In [None]:

# Column-oriented accumulator: one list per output column, filled row by row.
dfBot = {column: [] for column in (
    'user_id',
    'user_name',
    'user_screen_name',
    'user_location',
    'user_description',
    'user_followers_count',
    'user_friends_count',
    'description_len',
    'followers_to_all_ratio',
    'time_ratio',
    'tnumber_ratio',
    'favourites_count',
    'likes_time_ratio',
    'tweet_number',
    'days_since_created',
)}

with gzip.open(f"{PATH_TO_DOWNLOADED}user_profiles.jsons.gz", "rb") as f:
    # One JSON document (one user profile) per line.
    for line in f:
        line = json.loads(line)

        # Raw profile fields go straight into their same-named columns.
        dictionary = get_user_info_metadata(line)
        for column_name, column_value in dictionary.items():
            dfBot[column_name].append(column_value)

        tweet_number = get_user_tnumber_metadata(line)
        favourites_count = get_favourites_count(line)

        # Account age relative to the fixed reference date 2022-12-31.
        user_created_at = get_user_time_metadata(line)
        since = datetime.datetime(2022, 12, 31) - user_created_at

        # manually crafted data:
        followers_count = dictionary['user_followers_count']
        derived_features = {
            'description_len': get_desc_len(dictionary['user_description']),
            'followers_to_all_ratio': get_followers_all_ratio(followers_count,
                                                              dictionary['user_friends_count']),
            'time_ratio': get_timeof_follows(user_created_at, followers_count),
            'tnumber_ratio': get_user_tnumber_time_ratio(user_created_at, tweet_number),
            'favourites_count': favourites_count,
            'likes_time_ratio': get_likes_time_ratio(user_created_at, favourites_count),
            'tweet_number': tweet_number,
            'days_since_created': since.days,
        }
        for column_name, column_value in derived_features.items():
            dfBot[column_name].append(column_value)

In [None]:
# Debug helper: print the first 100 parsed user profiles to inspect their fields.
i = 0
with gzip.open(f"{PATH_TO_DOWNLOADED}user_profiles.jsons.gz", "rb") as f:
    for i, line in enumerate(f, start=1):
        line = json.loads(line)
        print(line)

        if i == 100:
            break

{'id': 1431241870848450577, 'id_str': '1431241870848450577', 'name': 'Nasreena Khan Wazir', 'screen_name': 'NasreenaKhan006', 'location': 'Islamabad, Pakistan', 'description': 'Student', 'url': None, 'entities': {'description': {'urls': []}}, 'protected': False, 'followers_count': 65, 'friends_count': 185, 'listed_count': 0, 'created_at': 'Fri Aug 27 13:07:30 +0000 2021', 'favourites_count': 17676, 'utc_offset': None, 'time_zone': None, 'geo_enabled': True, 'verified': False, 'statuses_count': 2551, 'lang': None, 'status': {'created_at': 'Sun Dec 11 06:06:33 +0000 2022', 'id': 1601820715174748162, 'id_str': '1601820715174748162', 'text': '@pmln_org Lati b apni behns b insaf hi hoga', 'truncated': False, 'entities': {'hashtags': [], 'symbols': [], 'user_mentions': [{'screen_name': 'pmln_org', 'name': 'PML(N)', 'id': 497658491, 'id_str': '497658491', 'indices': [0, 9]}], 'urls': []}, 'source': '<a href="http://twitter.com/download/android" rel="nofollow">Twitter for Android</a>', 'in_rep

In [None]:
# Turn the dict of column lists into a DataFrame (one row per user profile).
dfBot = pd.DataFrame(dfBot)
dfBot

Unnamed: 0,user_id,user_name,user_screen_name,user_location,user_description,user_followers_count,user_friends_count,description_len,followers_to_all_ratio,time_ratio,tnumber_ratio,favourites_count,likes_time_ratio,tweet_number,days_since_created
0,1431241870848450577,Nasreena Khan Wazir,nasreenakhan006,"Islamabad, Pakistan",Student,65,185,7,0.260,0.132,5.196,17676,36.000,2551,491
1,1304340303080386560,fania :((((,scorpiehoez,bogor,have a holly jolly🎄,8235,3011,19,0.732,9.792,50.857,15474,18.400,42771,841
2,1116042038577958914,Yusuf Aksoy,yusufak63712920,,"Bir şeyden pişmanlık duymak istemiyorsan,her ş...",95,399,64,0.192,0.070,10.507,18220,13.387,14300,1361
3,4859899931,Be (VIXX6) ama oppalarının düğününe gidemiyor,nedenburdaysam,Hufflepuff ortak salon,"SMStan\n/St☆rlight ///come on girls,this is ou...",40,83,65,0.325,0.016,8.427,26999,10.680,21303,2528
4,2225373636,SLMDMR,biologselim,,BİYOLOG🔬🦠\nNanoteknoloji,100,98,23,0.505,0.030,0.491,2179,0.657,1629,3317
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29665,1320834618220781569,islammm,islam_mrsj,,,229,217,0,0.513,0.288,3.010,10820,13.593,2396,796
29666,111074128,Melda Onur,meldaonur,,"Şekersiz çay, etsiz sofra, SAVAŞSIZ dünya... 🐌...",212457,5428,100,0.975,45.069,15.948,36671,7.779,75178,4714
29667,36946875,ali ydm,ali_ydm,"İstanbul, Türkiye",hayat oyunsa bende jeton çok,116,141,28,0.451,0.023,1.298,7389,1.480,6482,4992
29668,2389587396,Türkan Usta,turkanusta,"Ankara, Türkiye",Ustaya sormuşlar; hayatta yaptığın en büyük is...,1669,3639,116,0.314,0.519,37.683,140095,43.589,121113,3214


### 1.2.2. Get Tweet Info of Users in user_profiles.jsons.gz

Ratio of tweets containing at least one hashtag to all tweets of the user

In [None]:
def get_hash(line):
    """Return the fraction of a user's tweets that contain at least one hashtag.

    Args:
        line: a user record whose 'tweets' entry is a list of tweet dicts.

    Returns:
        ``tweets_with_hashtags / counted_tweets`` (not rounded), or 0 when no
        tweet was counted. Empty tweet entries are skipped entirely.
    """
    number_hashtags = 0
    number_of_tweets = 0

    for tweet in line["tweets"]:
        # Skip empty entries: they must neither be counted nor be subscripted
        # (previously the lookup below ran anyway and raised on an empty entry).
        if not tweet:
            continue

        number_of_tweets += 1
        if tweet["entities"]["hashtags"] != []:
            number_hashtags += 1

    if number_of_tweets == 0:
        return 0

    return number_hashtags / number_of_tweets


In [None]:
def get_reply_to_tweet(line):
  """Return the fraction of a user's tweets that are replies to another tweet.

  A tweet counts as a reply when its 'in_reply_to_status_id' is not None.

  Args:
      line: a user record whose 'tweets' entry is a list of tweet dicts.

  Returns:
      ``replies / counted_tweets`` rounded to 3 decimals, or 0 when no tweet
      was counted. Empty tweet entries are skipped entirely.
  """
  number_replys = 0
  number_of_tweets = 0

  for tweet in line["tweets"]:
    # Skip empty entries: they must neither be counted nor be subscripted
    # (previously the lookup below ran anyway and raised on an empty entry).
    if not tweet:
      continue

    number_of_tweets += 1
    if tweet["in_reply_to_status_id"] is not None:
      number_replys += 1

  if number_of_tweets == 0:
    return 0

  return round(number_replys / number_of_tweets, 3)




#### 1.2.2.1. Check ratio of retweets to all tweets

In [None]:
def get_retweet_tweet_ratio(line):
    """Return the fraction of a user's tweets that are retweets.

    A tweet counts as a retweet when it has a 'retweeted_status' key; every
    other entry (including one that cannot be looked up by key) counts as an
    original tweet.

    Args:
        line: a user record whose 'tweets' entry is a list of tweet dicts.

    Returns:
        ``retweets / all_tweets`` rounded to 3 decimals, or None when the user
        has no tweets.
    """
    number_retweets = 0
    number_original_tweets = 0

    for tweet in line['tweets']:
        try:
            tweet['retweeted_status']
        except (KeyError, TypeError):
            # KeyError: ordinary tweet without the key. TypeError: an entry
            # that is not a mapping (e.g. an empty list). A bare ``except``
            # would also swallow KeyboardInterrupt and hide real bugs.
            number_original_tweets += 1
        else:
            number_retweets += 1

    total_tweets = number_retweets + number_original_tweets

    if total_tweets == 0:
        return None

    return round(number_retweets / total_tweets, 3)

#### 1.2.2.2. Check median number of favorites

In [None]:
def get_median_number_favorites(line):
    """Return the median 'favorite_count' over a user's tweets.

    Args:
        line: a user record whose 'tweets' entry is a list of tweet dicts.

    Returns:
        The median as computed by ``np.median``, or NaN when the user has no
        tweets. The empty case is handled explicitly because ``np.median`` of
        an empty list also yields NaN but emits RuntimeWarnings on the way.
    """
    favorite_counts = [tweet['favorite_count'] for tweet in line['tweets']]

    if not favorite_counts:
        return np.nan

    return np.median(favorite_counts)

### 1.2.3. Collect data using the functions above and transform into a Pandas DataFrame

In [None]:
# Column-oriented accumulator: one list per output column, filled row by row.
dfBotTweets = {column: [] for column in (
    'user_id',
    'retweet_total_ratio',
    'num_median_favorites',
    'num_of_tweets',
    'hashtag_tweets_to_total_tweet_ratio',
    'reply_to_tweet_ratio',
)}

i = 0

with gzip.open(f"{PATH_TO_DOWNLOADED}user_tweets.jsons.gz", "rb") as f:
    # The first line is read and discarded before the loop.
    # NOTE(review): presumably not a user record — confirm, otherwise the
    # first user is silently dropped.
    first_line = f.readline()

    for i, line in enumerate(f, start=1):
        line = json.loads(line)

        user_id = line['user_id']
        retweet_total_ratio = get_retweet_tweet_ratio(line)
        hashtag_ratio = get_hash(line)
        reply_ratio = get_reply_to_tweet(line)
        num_median_favorites = get_median_number_favorites(line)

        user_row = {
            'user_id': user_id,
            'retweet_total_ratio': retweet_total_ratio,
            'hashtag_tweets_to_total_tweet_ratio': hashtag_ratio,
            'reply_to_tweet_ratio': reply_ratio,
            'num_median_favorites': num_median_favorites,
            'num_of_tweets': len(line['tweets']),
        }
        for column_name, column_value in user_row.items():
            dfBotTweets[column_name].append(column_value)

        # Progress indicator every 1000 users.
        if i % 1000 == 0:
            print(i)

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000


In [None]:
# Debug helper: print the reply ratio of the first 100 users after the
# discarded first line.
i = 0
with gzip.open(f"{PATH_TO_DOWNLOADED}user_tweets.jsons.gz", "rb") as f:
    first_line = f.readline()

    for i, line in enumerate(f, start=1):
        line = json.loads(line)
        reply_ratio = get_reply_to_tweet(line)
        print(reply_ratio)

        if i == 100:
            break

0.769
0.025
0.72
0.0
0.635
0.02
0.08
0.955
0.351
0.03
0.559
0.063
0.147
0.123
0.96
0.445
0.857
0.085
0.035
0.445
0.0
0.635
0.34
0.0
0.0
0.02
0.0
0.533
0.79
0.4
0.065
0.65
0.0
0.98
0.354
0.249
0.055
0.27
0.0
0.345
0.0
0.077
0.558
0.533
0.08
0.985
0.89
0.241
0.92
0.035
0.735
0.759
0.47
0.165
0.01
0.25
1.0
0.15
0.125
0.855
0.91
0.0
0.397
0.935
0.99
0.89
0.005
0.924
0.032
0.67
0.145
0.39
0.055
0.387
0.036
0.729
0.264
0.497
0.096
0.082
0.246
0.335
0.9
0.253
0.473
0.041
0.535
0.99
0.27
0.045
0.408
0
0.005
0.286
0.508
0.0
0.395
0.985
0.43
0.175


In [None]:
# Turn the dict of column lists into a DataFrame (one row per user).
dfBotTweets = pd.DataFrame(dfBotTweets)
dfBotTweets

Unnamed: 0,user_id,retweet_total_ratio,num_median_favorites,num_of_tweets,hashtag_tweets_to_total_tweet_ratio,reply_to_tweet_ratio
0,525600289,0.005,1.0,199,0.085427,0.769
1,931895965501534209,0.900,0.0,200,0.710000,0.025
2,1591543462746329088,0.185,0.0,200,0.060000,0.720
3,734801354749796352,1.000,0.0,200,0.075000,0.000
4,1384697395439706113,0.045,0.0,200,0.030000,0.635
...,...,...,...,...,...,...
28309,1591370361488252928,0.800,0.0,200,0.420000,0.120
28310,1475272459616235525,0.825,0.0,200,0.120000,0.155
28311,1096753792731750401,0.051,1.0,196,0.015306,0.622
28312,1269527617687953409,0.095,2.0,200,0.005000,0.155


### 1.2.4. Merge dfBot and dfBotTweets

In [None]:
# Left join on the only shared column (user_id): every profile is kept, even
# when no tweet features were collected for it.
dfBotAll = pd.merge(dfBot, dfBotTweets, how='left', on='user_id')

# Missing values in these two tweet-derived features are replaced by 0; the
# other tweet-derived columns keep their NaN.
zero_filled_columns = ['retweet_total_ratio', 'num_median_favorites']
dfBotAll[zero_filled_columns] = dfBotAll[zero_filled_columns].fillna(0)

dfBotAll

Unnamed: 0,user_id,user_name,user_screen_name,user_location,user_description,user_followers_count,user_friends_count,description_len,followers_to_all_ratio,time_ratio,tnumber_ratio,favourites_count,likes_time_ratio,tweet_number,days_since_created,retweet_total_ratio,num_median_favorites,num_of_tweets,hashtag_tweets_to_total_tweet_ratio,reply_to_tweet_ratio
0,1431241870848450577,Nasreena Khan Wazir,nasreenakhan006,"Islamabad, Pakistan",Student,65,185,7,0.260,0.132,5.196,17676,36.000,2551,491,0.396,0.0,197.0,0.076142,0.599
1,1304340303080386560,fania :((((,scorpiehoez,bogor,have a holly jolly🎄,8235,3011,19,0.732,9.792,50.857,15474,18.400,42771,841,0.125,0.0,200.0,0.000000,0.635
2,1116042038577958914,Yusuf Aksoy,yusufak63712920,,"Bir şeyden pişmanlık duymak istemiyorsan,her ş...",95,399,64,0.192,0.070,10.507,18220,13.387,14300,1361,0.910,0.0,200.0,0.710000,0.005
3,4859899931,Be (VIXX6) ama oppalarının düğününe gidemiyor,nedenburdaysam,Hufflepuff ortak salon,"SMStan\n/St☆rlight ///come on girls,this is ou...",40,83,65,0.325,0.016,8.427,26999,10.680,21303,2528,0.015,1.0,196.0,0.005102,0.332
4,2225373636,SLMDMR,biologselim,,BİYOLOG🔬🦠\nNanoteknoloji,100,98,23,0.505,0.030,0.491,2179,0.657,1629,3317,0.660,0.0,197.0,0.390863,0.102
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29665,1320834618220781569,islammm,islam_mrsj,,,229,217,0,0.513,0.288,3.010,10820,13.593,2396,796,0.015,1.0,200.0,0.010000,0.940
29666,111074128,Melda Onur,meldaonur,,"Şekersiz çay, etsiz sofra, SAVAŞSIZ dünya... 🐌...",212457,5428,100,0.975,45.069,15.948,36671,7.779,75178,4714,0.291,2.0,199.0,0.075377,0.312
29667,36946875,ali ydm,ali_ydm,"İstanbul, Türkiye",hayat oyunsa bende jeton çok,116,141,28,0.451,0.023,1.298,7389,1.480,6482,4992,0.062,0.0,195.0,0.025641,0.938
29668,2389587396,Türkan Usta,turkanusta,"Ankara, Türkiye",Ustaya sormuşlar; hayatta yaptığın en büyük is...,1669,3639,116,0.314,0.519,37.683,140095,43.589,121113,3214,0.995,0.0,200.0,0.035000,0.005


# 2. TRAIN MODEL

## 2.1. Political Tweet Prediction

In [None]:
from xgboost import XGBClassifier
import xgboost as xg



### 2.1.1. Merge dfPolitical data with labels

In [None]:
# Attach the isPolitical label to the extracted tweet features. merge() is
# called without `how`, so only tweet_ids present in both frames are kept.
dfPoliticalAll_train = dfPolitical.merge(trainingTweetDf,
                                         on='tweet_id')

dfPoliticalAll_train.head()

Unnamed: 0,tweet_id,is_retweet,retweeted_username,text,num_mentions,num_hashtags,num_retweets,num_favorites,user_id,user_screen_name,user_description,political_entity_exist,total_interactions,ratio_interactions,isPolitical
0,1588568792984346624,0,,"Sosyal Hizmetin temelini çocuk oluşturur,çocuğ...",0,0,49,98,920963718103650304,maviruh_,shu/\nburaya afilli bir söz yazdığımı varsayın,0,147,1.96,Yes
1,1588452263047069697,0,,"@mahirunal Gavur İzmir ya onlar, hani Cumhuriy...",1,0,0,0,595514060,mtfdan,,2,0,0.0,Yes
2,1569589330544398336,0,,#ŞehitAdayıUzmÇvşaKadro\nSiz İstesenizde Istem...,0,1,0,0,1356375754561490947,ahsucilginuzman,Vatan Sevdalisi,1,0,0.0,Yes
3,1570428119609139201,0,,@ajans_muhbir Siz kaypak olmayıp onay vermesey...,1,0,0,0,1478775431008595968,hamitelkelle,HighOne,1,0,0.0,Yes
4,1551163840368414722,0,,Engelli öğretmenler olarak önümüzdeki engeller...,0,0,0,0,1511976696337113088,sed58417690,,0,0,0.0,Yes


### 2.1.2. Separate X and y values
We use only 4 features here (political_entity_exist, total_interactions, num_hashtags and ratio_interactions) to create a baseline model. However, this is not enough to get good results.

In [None]:
# Feature matrix: the four numeric baseline features.
baseline_features = ['political_entity_exist', 'total_interactions', 'num_hashtags', "ratio_interactions"]
X = dfPoliticalAll_train[baseline_features]

# Binary target: 1 for the label 'Yes', 0 for anything else.
y = (dfPoliticalAll_train['isPolitical'] == 'Yes').astype('int64')

### 2.1.3. Train - validation split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labeled tweets for validation; the fixed random_state
# keeps the split the same across runs.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.20, random_state=42)

### 2.1.4. Train the model

Here, you may use different models such as neural networks, XGBoost, AdaBoost, RandomForest, Linear Regression, Logistic Regression etc. to see which model does the best. Also, you can use grid_search_cv() or a basic for loop to optimize the hyperparameters of your model.

In [None]:
# imports kept from the notebook template; only mean_squared_error is used in this cell
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, mean_squared_error

# despite the `dtc_` prefix this is an XGBoost regressor with a logistic objective;
# it is created and fitted on the training split in one step
dtc_political = xg.XGBRegressor(objective='reg:logistic').fit(X_train, y_train)

# score the held-out validation rows
preds = dtc_political.predict(X_valid)

# mean squared error between the 0/1 labels and the predicted scores
mse = mean_squared_error(y_valid, preds)

print("MSE:", mse, "\n")

MSE: 0.13601198614474375 



## 2.2. Bot Detection

### 2.2.1. Merge dfBotAll data with labels

In [None]:
# lowercase the screen names before they are used as the join key with the label file
dfBotAll['user_screen_name'] = dfBotAll['user_screen_name'].str.lower()

In [None]:
# attach the isBot label to every user that has one (inner join on the screen name)
dfBotAll_train = pd.merge(
    dfBotAll,
    trainingUserDf,
    how='inner',
    left_on='user_screen_name',
    right_on='screen_name',
)

dfBotAll_train

Unnamed: 0,user_id,user_name,user_screen_name,user_location,user_description,user_followers_count,user_friends_count,description_len,followers_to_all_ratio,time_ratio,tnumber_ratio,favourites_count,likes_time_ratio,tweet_number,days_since_created,retweet_total_ratio,num_median_favorites,num_of_tweets,hashtag_tweets_to_total_tweet_ratio,reply_to_tweet_ratio,screen_name,isBot
0,1431241870848450577,Nasreena Khan Wazir,nasreenakhan006,"Islamabad, Pakistan",Student,65,185,7,0.260,0.132,5.196,17676,36.000,2551,491,0.396,0.0,197.0,0.076142,0.599,nasreenakhan006,No
1,1304340303080386560,fania :((((,scorpiehoez,bogor,have a holly jolly🎄,8235,3011,19,0.732,9.792,50.857,15474,18.400,42771,841,0.125,0.0,200.0,0.000000,0.635,scorpiehoez,No
2,1116042038577958914,Yusuf Aksoy,yusufak63712920,,"Bir şeyden pişmanlık duymak istemiyorsan,her ş...",95,399,64,0.192,0.070,10.507,18220,13.387,14300,1361,0.910,0.0,200.0,0.710000,0.005,yusufak63712920,No
3,4859899931,Be (VIXX6) ama oppalarının düğününe gidemiyor,nedenburdaysam,Hufflepuff ortak salon,"SMStan\n/St☆rlight ///come on girls,this is ou...",40,83,65,0.325,0.016,8.427,26999,10.680,21303,2528,0.015,1.0,196.0,0.005102,0.332,nedenburdaysam,No
4,2225373636,SLMDMR,biologselim,,BİYOLOG🔬🦠\nNanoteknoloji,100,98,23,0.505,0.030,0.491,2179,0.657,1629,3317,0.660,0.0,197.0,0.390863,0.102,biologselim,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3540,1286770207134973954,Hamide Arabacı,anka6054,,,151,61,0,0.712,0.170,5.942,5347,6.008,5288,890,0.000,1.0,200.0,0.005000,0.950,anka6054,No
3541,1598032338323214338,atamabekleyenbahceci,atamabekleyenzz,,,173,367,0,0.320,5.581,10.032,196,6.323,311,31,0.580,0.0,200.0,0.105000,0.420,atamabekleyenzz,No
3542,760235343966863360,Emrah İNCİ,memrahinci,Istanbul - Bayburt,Researcher | Middle East | Political Science |...,5863,5905,71,0.498,2.502,0.439,725,0.309,1029,2343,0.040,36.0,200.0,0.050000,0.115,memrahinci,No
3543,1553973684100124672,Murat Kkk,muratkkk18,,Normal sıradan bir insanım,1,10,26,0.091,0.007,0.118,38,0.250,18,152,0.769,0.0,13.0,0.307692,0.231,muratkkk18,No


In [None]:
# label distribution in the user training file
trainingUserDf['isBot'].value_counts()

No          2821
Yes          676
Not sure      47
Name: isBot, dtype: int64

In [None]:
# occurrences per tweet id in the labelled political frame (a quick duplicate check)
dfPoliticalAll_train['tweet_id'].value_counts()

1588568792984346624    1
1590230637637574656    1
1584205518109040640    1
1357610094696034305    1
1373245913041362944    1
                      ..
1393931156270292997    1
1594906721197199360    1
1597270416846098432    1
1598046905443192832    1
1596906607971307522    1
Name: tweet_id, Length: 3566, dtype: int64

### 2.2.2. Separate X and y values
We use 10 features here to create a baseline model. However, that is not enough to get good results.

In [None]:
# feature matrix: the ten numeric account/activity columns used by the bot model
X = dfBotAll_train[[
    'followers_to_all_ratio',
    'time_ratio',
    'tnumber_ratio',
    'hashtag_tweets_to_total_tweet_ratio',
    'favourites_count',
    'likes_time_ratio',
    'tweet_number',
    'user_friends_count',
    'reply_to_tweet_ratio',
    'days_since_created',
]]
# binary target: 'Yes' -> 1, every other value -> 0
# NOTE(review): this means 'Not sure' labels are trained as non-bot - confirm that is intended
y = (dfBotAll_train['isBot'] == 'Yes').astype('int64')

### 2.2.3. Train-test split

In [None]:
from sklearn.model_selection import train_test_split

# hold out 20% of the labelled users for validation; fixed seed keeps the split reproducible
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=123,
)

In [None]:
# Fill missing hashtag ratios in the training features with the column mean.
# The mean is used as a float: wrapping it in int() truncated the fractional
# mean toward zero, so the gaps were effectively filled with 0.
# The result is assigned back to X_train because a chained `inplace=True` on a
# column selection may act on a temporary copy and leave X_train unchanged.
X_train = X_train.fillna(
    {'hashtag_tweets_to_total_tweet_ratio': X_train['hashtag_tweets_to_total_tweet_ratio'].mean()}
)

# report how many NaNs remain in each column
missing_values_count = X_train.isna().sum()
print(missing_values_count)

followers_to_all_ratio                  0
time_ratio                              0
tnumber_ratio                           0
hashtag_tweets_to_total_tweet_ratio     0
favourites_count                        0
likes_time_ratio                        0
tweet_number                            0
user_friends_count                      0
reply_to_tweet_ratio                   84
days_since_created                      0
dtype: int64


In [None]:
# Fill missing reply ratios in the training features with the column mean.
# The mean is used as a float: wrapping it in int() truncated the fractional
# mean toward zero, so the gaps were effectively filled with 0.
# The result is assigned back to X_train because a chained `inplace=True` on a
# column selection may act on a temporary copy and leave X_train unchanged.
X_train = X_train.fillna(
    {'reply_to_tweet_ratio': X_train['reply_to_tweet_ratio'].mean()}
)

# report how many NaNs remain in each column
missing_values_count = X_train.isna().sum()
print(missing_values_count)

followers_to_all_ratio                 0
time_ratio                             0
tnumber_ratio                          0
hashtag_tweets_to_total_tweet_ratio    0
favourites_count                       0
likes_time_ratio                       0
tweet_number                           0
user_friends_count                     0
reply_to_tweet_ratio                   0
days_since_created                     0
dtype: int64


In [None]:
# Fill missing hashtag ratios in the validation features.
# The fill value is the mean of the *training* column, so no statistic is
# computed from the validation split itself. It is kept as a float: int()
# truncated the fractional mean toward zero, i.e. filled the gaps with 0.
# The result is assigned back to X_valid because a chained `inplace=True` on a
# column selection may act on a temporary copy and leave X_valid unchanged.
X_valid = X_valid.fillna(
    {'hashtag_tweets_to_total_tweet_ratio': X_train['hashtag_tweets_to_total_tweet_ratio'].mean()}
)

# report how many NaNs remain in each column
missing_values_count = X_valid.isna().sum()
print(missing_values_count)

followers_to_all_ratio                  0
time_ratio                              0
tnumber_ratio                           0
hashtag_tweets_to_total_tweet_ratio     0
favourites_count                        0
likes_time_ratio                        0
tweet_number                            0
user_friends_count                      0
reply_to_tweet_ratio                   17
days_since_created                      0
dtype: int64


In [None]:
# Fill missing reply ratios in the validation features.
# The fill value is the mean of the *training* column, so no statistic is
# computed from the validation split itself. It is kept as a float: int()
# truncated the fractional mean toward zero, i.e. filled the gaps with 0.
# The result is assigned back to X_valid because a chained `inplace=True` on a
# column selection may act on a temporary copy and leave X_valid unchanged.
X_valid = X_valid.fillna(
    {'reply_to_tweet_ratio': X_train['reply_to_tweet_ratio'].mean()}
)

# report how many NaNs remain in each column
missing_values_count = X_valid.isna().sum()
print(missing_values_count)

followers_to_all_ratio                 0
time_ratio                             0
tnumber_ratio                          0
hashtag_tweets_to_total_tweet_ratio    0
favourites_count                       0
likes_time_ratio                       0
tweet_number                           0
user_friends_count                     0
reply_to_tweet_ratio                   0
days_since_created                     0
dtype: int64


In [None]:
# frequency of each friends-count value in the training features
X_train['user_friends_count'].value_counts()

0        31
1        12
30       10
2        10
109      10
         ..
1087      1
236       1
21658     1
907       1
15604     1
Name: user_friends_count, Length: 1644, dtype: int64

### 2.2.4. Train the model

In [None]:
# imports kept from the notebook template; only mean_squared_error and xg are used in this cell
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, mean_squared_error
import xgboost as xg


# despite the `dtc_` prefix this is an XGBoost regressor with a logistic objective;
# it is created and fitted on the training split in one step
dtc_bot = xg.XGBRegressor(objective='reg:logistic').fit(X_train, y_train)

# score the held-out validation rows
preds = dtc_bot.predict(X_valid)

# mean squared error between the 0/1 labels and the predicted scores
mse = mean_squared_error(y_valid, preds)


print("MSE:", mse, "\n")

MSE: 0.11898865256894928 



# 3. MAKE PREDICTIONS

Here, you will make predictions with the models that you have trained above.

## 3.1. Predictions for Tweets (Political or Not)

In [None]:
# load the round-3 evaluation tweet ids (file has no header row; ids are read as strings)
# and drop empty rows
evaluationTweetDf = pd.read_csv(
    '{}evaluation-round3-tweet.csv'.format(DATA_PATH),
    dtype={0: str},
    header=None,
    names=['tweet_id'],
).dropna()

# inner-join with the political feature frame so every evaluation tweet that has
# features can be scored; tweet_id is the only column the two frames share
dfPolitical_test = dfPolitical.merge(evaluationTweetDf, on='tweet_id')

# same feature columns, in the same order, as in section 2.1.2
X = dfPolitical_test[[
    'political_entity_exist',
    'total_interactions',
    'num_hashtags',
    'ratio_interactions',
]]


# score the evaluation tweets with the model trained in section 2.1.4
predictions_political = dtc_political.predict(X)
predictions_political

array([0.29409143, 0.29409143, 0.8837473 , ..., 0.29409143, 0.15945   ,
       0.8630889 ], dtype=float32)

### This part is important! We expect you to return your predictions in the following format:

In [None]:
# map each evaluation tweet id to its predicted score as a plain Python float
# (plain floats are what json can serialise later on)
modelPredTweet = {
    tweet_id: float(score)
    for tweet_id, score in zip(dfPolitical_test.tweet_id, predictions_political)
}
modelPredTweet

{'1434787703783051264': 0.29409143328666687,
 '1367571642604544000': 0.29409143328666687,
 '1589993032975544320': 0.8837472796440125,
 '1565312596135354373': 0.8821874856948853,
 '1579558096833511424': 0.8612817525863647,
 '1439547067337256967': 0.13519854843616486,
 '1559963768372740098': 0.6954173445701599,
 '1562853131251118081': 0.35775336623191833,
 '1586021183958704128': 0.846939742565155,
 '1585766233491886081': 0.846939742565155,
 '1427746815420604417': 0.08846081793308258,
 '1352635736537882629': 0.2523629665374756,
 '1415032260571680768': 0.29409143328666687,
 '1548636597628899328': 0.9646464586257935,
 '1564926450096013313': 0.29409143328666687,
 '1585634359612420101': 0.879113495349884,
 '1597138789108895744': 0.8106664419174194,
 '1391681495622995971': 0.1594499945640564,
 '1389951943343316995': 0.1594499945640564,
 '1452348722810138646': 0.846939742565155,
 '1595829502021623812': 0.879113495349884,
 '1413108476348354562': 0.1594499945640564,
 '1579408398894137344': 0.8791

## 3.2. Predictions for Users (Bot or Not)

In [None]:
# load the round-3 evaluation screen names (file has no header row; read as strings)
# and drop empty rows
evaluationUserDf = pd.read_csv(
    '{}evaluation-round3-user.csv'.format(DATA_PATH),
    dtype={0: str},
    header=None,
    names=['user_screen_name'],
).dropna()

# inner-join with the bot feature frame so every evaluation user that has features
# can be scored; user_screen_name is the only column the two frames share
dfBot_test = dfBotAll.merge(evaluationUserDf, on='user_screen_name')

# same feature columns, in the same order, as in section 2.2.2
# NOTE(review): unlike X_train / X_valid, this X is not NaN-imputed before predict - confirm
X = dfBot_test[[
    'followers_to_all_ratio',
    'time_ratio',
    'tnumber_ratio',
    'hashtag_tweets_to_total_tweet_ratio',
    'favourites_count',
    'likes_time_ratio',
    'tweet_number',
    'user_friends_count',
    'reply_to_tweet_ratio',
    'days_since_created',
]]

# score the evaluation users with the model trained in section 2.2.4
predictions_bot = dtc_bot.predict(X)
predictions_bot

array([0.18177284, 0.11526518, 0.14281555, ..., 0.6587381 , 0.15952553,
       0.10854047], dtype=float32)

In [None]:
# map each evaluation screen name to its predicted score as a plain Python float
# (plain floats are what json can serialise later on)
modelPredUser = {
    screen_name: float(score)
    for screen_name, score in zip(dfBot_test.user_screen_name, predictions_bot)
}
modelPredUser

{'biologselim': 0.181772843003273,
 'omerakdag34': 0.11526518315076828,
 'bilgin21604923': 0.14281554520130157,
 '_sydneycarton_': 0.12511573731899261,
 'denizlihabercom': 0.03117663599550724,
 'burakerbaychp': 0.014246405102312565,
 'mvnez': 0.17574849724769592,
 'qara118': 0.02446994185447693,
 'nabiyonyevrum': 0.07864747941493988,
 'farukhalit2': 0.1500256210565567,
 'harlunoshi': 0.12990263104438782,
 'heritagepaix': 0.040980949997901917,
 'nuranwolf': 0.014569352380931377,
 'politikgundem': 0.16520415246486664,
 'isakethudax': 0.020506303757429123,
 'enveraysevera': 0.02026067115366459,
 'ilaydejaneiro': 0.05595738813281059,
 '1905anason': 0.11707920581102371,
 'eraydurgut03': 0.03287598863244057,
 'dasiskein': 0.016310622915625572,
 'ercan_bas29': 0.19096216559410095,
 'mett_1907': 0.07060025632381439,
 'ondemir066': 0.288661390542984,
 'semihyeteer': 0.03850675746798515,
 'haberinyokcokk': 0.11987173557281494,
 'meleky_ozaydin': 0.07125124335289001,
 'mehmetaltay64': 0.025357555

# PREPARE SUBMISSION

You will need to submit the exact same file produced by the following code. Any deviation from the desired format will be marked as 0.

In [None]:
# Explain your approach
# The four strings below are free text; they are written verbatim into the
# 'explanations' section of the submission JSON, so edit their wording with care.

# where the labels and the per-tweet / per-user characteristics came from
data_explanations = '''
I took the data of if it is a political tweet or if the user is a social bot or not from the annotated data of ours and the global data that was given to us. And from
the .json files I got the data of the characteristics of these train and test data.
'''

# which feature columns each of the two models uses
feature_explanations = '''
For the detection of political tweets I used the features of 'political_entity_exist','total_interactions','num_hashtags','ratio_interactions' whose characteristics can be understood by their names.
For the detection of social bots I used the features of 'followers_to_all_ratio', 'time_ratio', 'tnumber_ratio','hashtag_tweets_to_total_tweet_ratio','favourites_count','likes_time_ratio','tweet_number','user_friends_count','reply_to_tweet_ratio','days_since_created'. Time ratio is the follower number dived by the time passed since they created their account. And the "tnumber_ratio" is the ratio of number of tweets and time passed since creation of account.
'''

# which model family was used
# NOTE(review): both models above are xg.XGBRegressor(objective='reg:logistic');
# confirm that "logistic regression" is the wording you want to submit
model_explanations = '''
I've used XGBoost model. For political tweets and social bots I used logistic regression.
'''

# anything else worth reporting (left as a single newline)
additional_explanations = '''
'''

In [None]:
# assemble the submission payload: round / student metadata, both prediction
# dictionaries, and the free-text explanations
predictions = {
    'round': ROUND,
    'student_id': STUDENT_ID,
    'user_predictions': modelPredUser,
    'tweet_predictions': modelPredTweet,
    'explanations': {
        'data': data_explanations,
        'feature': feature_explanations,
        'model': model_explanations,
        'other': additional_explanations,
    },
}


# serialise straight into the output file, indented by 4 spaces
with open('predictions-{}_round{}.json'.format(STUDENT_ID, ROUND), 'w') as out_file:
    json.dump(predictions, out_file, indent=4)

In [None]:
# Test your submission file

# read the submission back to check that it parses as JSON; the with-block
# closes the file handle, which a bare json.load(open(...)) would leave open
with open('predictions-{}_round{}.json'.format(STUDENT_ID, ROUND), 'r') as fl:
    submission = json.load(fl)
submission

{'round': 3,
 'student_id': '28301',
 'user_predictions': {'biologselim': 0.181772843003273,
  'omerakdag34': 0.11526518315076828,
  'bilgin21604923': 0.14281554520130157,
  '_sydneycarton_': 0.12511573731899261,
  'denizlihabercom': 0.03117663599550724,
  'burakerbaychp': 0.014246405102312565,
  'mvnez': 0.17574849724769592,
  'qara118': 0.02446994185447693,
  'nabiyonyevrum': 0.07864747941493988,
  'farukhalit2': 0.1500256210565567,
  'harlunoshi': 0.12990263104438782,
  'heritagepaix': 0.040980949997901917,
  'nuranwolf': 0.014569352380931377,
  'politikgundem': 0.16520415246486664,
  'isakethudax': 0.020506303757429123,
  'enveraysevera': 0.02026067115366459,
  'ilaydejaneiro': 0.05595738813281059,
  '1905anason': 0.11707920581102371,
  'eraydurgut03': 0.03287598863244057,
  'dasiskein': 0.016310622915625572,
  'ercan_bas29': 0.19096216559410095,
  'mett_1907': 0.07060025632381439,
  'ondemir066': 0.288661390542984,
  'semihyeteer': 0.03850675746798515,
  'haberinyokcokk': 0.119871