# 0. Initialize

## 0.1. Import Libraries

In [None]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
import os, sys, glob
import gzip
import random
import tqdm
import json
import numpy as np
import pandas as pd
pd.set_option("display.max_columns", None)

from IPython import display
import matplotlib as mpl
from matplotlib import pyplot as plt

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## 0.2. DEFINE VARIABLES 

In [None]:
# Root folder (inside the mounted Google Drive) that the notebook reads its
# CSV and *.jsons.gz files from; replace it with your own training-data path.
DATA_PATH = '/content/drive/MyDrive/cs412/'
# List the current working directory as a quick sanity check.
%ls

# Prediction round; the project has 3 rounds of predictions: 1, 2, 3.
ROUND = 1
# Your student ID.
STUDENT_ID = '22534'
# Your project code -- the same code used for the annotation, e.g. CS412xxxxx.
PROJECT_CODE = 'CS412cb12d847a4ee'

[0m[01;34msample_data[0m/


In [None]:
# Mount Google Drive at /content/drive, the prefix that DATA_PATH points into.
# NOTE(review): this only works inside Google Colab, and it has to run before
# any cell that reads files from DATA_PATH.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## 0.3. Read Training & Evaluation Data

### 0.3.1. Get the labels for tweets

In [None]:
# Labels shipped with the project. IDs are read as strings so that the long
# numeric tweet IDs are never coerced to numbers.
trainingTweetDf = pd.read_csv('{}training-tweet.csv'.format(DATA_PATH), dtype={'tweet_id': str, 'isPolitical': str})

# My own annotations; the tweet ID lives in the unnamed first column.
my_training_tweet = pd.read_csv(('{}annotated_tweets_'+PROJECT_CODE+'.csv').format(DATA_PATH), dtype={'Unnamed: 0': str, 'isPolitical': str})

# Drop the annotation columns that are not used here and give the ID column its
# proper name. The axis is spelled out with `columns=` because passing it
# positionally (`drop(labels, 1)`) is deprecated and rejected by pandas >= 2.0.
my_training_tweet = (
    my_training_tweet
    .drop(columns=['url', 'sentiment', 'isExperiential', 'isInsult'])
    .rename(columns={'Unnamed: 0': 'tweet_id'})
)

# Concatenate -> drop duplicates -> reindex.
# My annotations come first, so drop_duplicates (keep='first' by default)
# prefers them over the given label when a tweet appears in both files.
trainingTweetDf = (
    pd.concat([my_training_tweet, trainingTweetDf], axis='rows')
    .drop_duplicates(subset=['tweet_id'])
    .reset_index(drop=True)
)

trainingTweetDf

  my_training_tweet.drop(['url', 'sentiment', 'isExperiential', 'isInsult'], 1, inplace=True)


Unnamed: 0,tweet_id,isPolitical,topics
0,1359541801925939211,No,"['Economy', 'Inflation&poverty', 'Unemployment']"
1,1585634359612420101,Yes,"['Democracy', 'Election', 'Crime&justice', 'Na..."
2,1590380120778285057,Yes,"['Democracy', 'Election', 'Crime&justice']"
3,1597512443328167936,Yes,"['Democracy', 'Crime&justice', 'Immigration&re..."
4,1351645835495354372,No,['Music&TV']
...,...,...,...
3165,1593539327623151619,Yes,
3166,1393886554062524418,No,
3167,1597925615092764672,Yes,
3168,1585291418616176640,Yes,


In [None]:
# Class balance of the tweet labels (rows without a label are not counted).
trainingTweetDf['isPolitical'].value_counts()

Yes    2113
No     1045
Name: isPolitical, dtype: int64

### 0.3.2. Get the labels for users

In [None]:
# Labels shipped with the project.
trainingUserDf = pd.read_csv('{}training-user.csv'.format(DATA_PATH), dtype={'screen_name': str, 'isBot': str})

# My own annotations; the screen name lives in the unnamed first column.
my_training_user = pd.read_csv(('{}annotated_users_'+PROJECT_CODE+'.csv').format(DATA_PATH), dtype={'Unnamed: 0': str, 'isBot': str})

# Drop the annotation columns that are not used here and give the key column
# its proper name. The axis is spelled out with `columns=` because passing it
# positionally (`drop(labels, 1)`) is deprecated and rejected by pandas >= 2.0.
my_training_user = (
    my_training_user
    .drop(columns=['url', 'isOrganizational', 'isTroll'])
    .rename(columns={'Unnamed: 0': 'screen_name'})
)

# Concatenate -> drop duplicates -> reindex.
# My annotations come first, so drop_duplicates (keep='first' by default)
# prefers them over the given label when a user appears in both files.
trainingUserDf = (
    pd.concat([my_training_user, trainingUserDf], axis='rows')
    .drop_duplicates(subset=['screen_name'])
    .reset_index(drop=True)
)

trainingUserDf

  my_training_user.drop(['url', 'isOrganizational', 'isTroll'], 1, inplace=True)


Unnamed: 0,screen_name,isBot,gender
0,hseyin278731983,Yes,Male
1,whisperhaber,No,Not sure
2,vedat74948368,No,Male
3,d_carpe_diem_,No,Female
4,tcsametguney,Yes,Male
...,...,...,...
3195,djblumenberg,No,
3196,mel1sq,No,
3197,eren_yz1,Yes,
3198,ergnyildiz4,No,


In [None]:
# Class balance of the user labels (rows without a label are not counted).
trainingUserDf['isBot'].value_counts()

No          2523
Yes          672
Not sure       5
Name: isBot, dtype: int64

### 0.3.3. Expand your dataset with metadata and tweets

In [None]:
# Optional: expand the training data with your own labeled datasets.
# Open the report page printed below and download the documents listed under
# "Link to training data". The URL is built from PROJECT_CODE.

print('http://www.onurvarol.com/Annotation-CS412-202201/reports/report_{}.html'.format(PROJECT_CODE))

http://www.onurvarol.com/Annotation-CS412-202201/reports/report_CS412cb12d847a4ee.html


# 1. EXTRACT FEATURES
Under *1.1. Political Tweet Detection* and *1.2. Bot Detection*, we first collect raw data for processing. We then combine some of the raw values (e.g. total_interactions = num_favorites + num_retweets) or use them to extract features (e.g. whether the tweet mentions one of the political entities such as @meralaksener or @kilicdarogluk).

We expect you to collect more raw data from the **tweet_metadata**, **user_profiles** and **user_tweets** files by writing functions like the examples below (such as *check_if_retweet()*) and calling them while iterating over the data, as shown under *Merge Collected Features*.

We also expect you to create as many new variables as you can from the data in order to make your predictions more accurate. For example, you may want to check:

- The tweet sources that a user frequently uses
- Whether the user is a verified account or not

...

to assess whether **a user is a bot or not** and whether **a tweet is political or not**.

In [None]:
# Folder the *.jsons.gz files are read from; here it is the same folder as DATA_PATH.
PATH_TO_DOWNLOADED = DATA_PATH

## 1.1. Political Tweet Detection
This part covers feature extraction for tweets. We start by collecting the raw data from *tweet_metadata*, then use some of it to extract features.

### 1.1.1. Get Raw Data

#### 1.1.1.1. Check if Retweet

In [None]:
def check_if_retweet(tweet_metadata_line):
    """Tell whether a tweet is a retweet and, if so, of whom.

    Args:
        tweet_metadata_line: parsed tweet metadata (a dict).

    Returns:
        (is_retweet, retweeted_username): ``(1, <lower-cased screen name>)``
        when ``retweeted_status -> user -> screen_name`` can be read,
        otherwise ``(0, None)``.
    """
    try:
        # Any missing key along this path means "not a (usable) retweet".
        retweeted_username = tweet_metadata_line['retweeted_status']['user']['screen_name'].lower()
    except KeyError:
        return 0, None

    return 1, retweeted_username

#### 1.1.1.2. Get Tweet Text

In [None]:
def get_tweet_text(tweet_metadata_line):
    """Return the value stored under the 'text' key of a parsed tweet."""
    return tweet_metadata_line['text']

#### 1.1.1.3. Get Tweet ID

In [None]:
def get_tweet_id(tweet_metadata_line):
    """Return the value stored under the 'id_str' key of a parsed tweet."""
    return tweet_metadata_line['id_str']

#### 1.1.1.4. Get Number of Mentions and Hashtags

In [None]:
def get_number_mentions_hashtags(tweet_metadata_line):
    """Count the user mentions and hashtags listed in a tweet's entities.

    Returns:
        (num_mentions, num_hashtags): lengths of ``entities['user_mentions']``
        and ``entities['hashtags']``.
    """
    entities = tweet_metadata_line['entities']
    return len(entities['user_mentions']), len(entities['hashtags'])

#### 1.1.1.5. Get Number of Retweets and Favorites

In [None]:
def get_number_retweets_favorites(tweet_metadata_line):
    """Return ``(retweet_count, favorite_count)`` as stored in a parsed tweet."""
    return tweet_metadata_line['retweet_count'], tweet_metadata_line['favorite_count']

#### 1.1.1.6. Get User Info

In [None]:
def get_user_info(tweet_metadata_line):
    """Extract the author's basic profile fields from a parsed tweet.

    Returns:
        (id_str, screen_name, description) read from the tweet's 'user'
        object; the screen name is lower-cased.
    """
    author = tweet_metadata_line['user']
    return author['id_str'], author['screen_name'].lower(), author['description']

### 1.1.2. Derive Manually Crafted Features

#### 1.1.2.1. Check for political entity in text

In [None]:
def check_political_ent(text, checkIn):
    """Count political keywords that occur (as substrings) in ``text``.

    Matching is case-insensitive: both the text and the keywords are
    lower-cased before the substring test.

    Args:
        text: string to search; ``None`` or an empty string yields 0.
        checkIn: which keyword list to use:
            'tweet'         -> account keywords + party keywords,
            'desc'          -> profile-description keywords,
            'retweetedFrom' -> account keywords, result collapsed to 0/1.

    Returns:
        int: number of distinct keywords found in ``text``; for
        'retweetedFrom' 1 if any keyword matches, else 0. An unrecognised
        ``checkIn`` returns 0.
    """
    # Missing text (e.g. an absent profile description) simply has no matches.
    if not text:
        return 0
    text = text.lower()

    # These keywords have been drastically reduced due to public access from github!
    # Possible keywords that can be found in political tweets.
    # ' ak ' keeps its surrounding spaces so it only matches as a separate token.
    tweet_entities = [' ak ', 'akp', 'chp', 'hdp', 'mhp', 'iyi part', "ak parti"]

    # These keywords have been drastically reduced due to public access from github!
    # Possible keywords that can be found in the profile descriptions of people who regularly tweet politically.
    desc_entities = [' ak ', 'akp', 'chp', 'mhp', 'hdp', 'iyi part', "ak parti"]

    # These keywords have been drastically reduced due to public access from github!
    # Has this tweet been retweeted by people who mostly tweeted politically?
    retweet_from_entities = ['meral_aksener', 'kilicdarogluk', 'vekilince', 'RTE', 'rte', "recep tayyip", 'tayyip']

    if checkIn == 'retweetedFrom':
        candidates = retweet_from_entities
    elif checkIn == 'tweet':
        candidates = retweet_from_entities + tweet_entities
    elif checkIn == 'desc':
        candidates = desc_entities
    else:
        return 0

    # Lower-case first, then de-duplicate (order preserved): 'RTE' and 'rte'
    # are the same keyword once lower-cased and must not be counted twice.
    keywords = dict.fromkeys(ent.lower() for ent in candidates)
    num_matches = sum(1 for ent in keywords if ent in text)

    if checkIn == 'retweetedFrom':
        return 1 if num_matches >= 1 else 0

    return num_matches

#### 1.1.2.2. Number of total interactions

In [None]:
def total_interactions(retweet_count, favorite_count):
    """Return the sum of a tweet's retweet and favorite counts."""
    return retweet_count + favorite_count

In [None]:
def political_word_total_word_ratio(text):
    """Ratio of political keyword hits to distinct whitespace-separated words.

    Args:
        text: tweet text.

    Returns:
        float: ``check_political_ent(text, 'tweet')`` divided by the number of
        unique words in ``text``; 0.0 when the text is missing or contains no
        words (the unguarded division raised ZeroDivisionError in that case).
    """
    if not text:
        return 0.0

    unique_words = set(text.split())
    # Whitespace-only text has no words, so there is nothing to divide by.
    if not unique_words:
        return 0.0

    return check_political_ent(text, 'tweet') / len(unique_words)

### 1.1.3. Collect data using the functions above and transform into a Pandas DataFrame

In [None]:
# Column-oriented accumulator: one list per feature, filled in lock-step
# (one entry per tweet) by the loop below.
dfPolitical = {column: [] for column in (
    'tweet_id',
    'is_retweet',
    'retweeted_username',
    'text',
    'num_mentions',
    'num_hashtags',
    'num_retweets',
    'num_favorites',
    'user_id',
    'user_screen_name',
    'user_description',
    'num_political_entities_tweet',
    'num_political_entities_description',
    'retweeted_political',
    'political_word_ratio',
    'total_interactions',
)}


# The file holds one JSON document (one tweet) per line.
with gzip.open(f"{PATH_TO_DOWNLOADED}tweet_metadata.jsons.gz", "rb") as f:
    for line in map(json.loads, f):

        # raw data:
        id_str = get_tweet_id(line)
        is_retweet, retweeted_username = check_if_retweet(line)
        text = get_tweet_text(line)
        num_mentions, num_hashtags = get_number_mentions_hashtags(line)
        retweet_count, favorite_count = get_number_retweets_favorites(line)
        user_id_str, screen_name, user_description = get_user_info(line)

        # manually crafted data:
        total_num_interactions = total_interactions(retweet_count, favorite_count)
        # Only retweets have a source account to check against the keyword list.
        retweeted_political = bool(check_political_ent(retweeted_username, 'retweetedFrom')) if is_retweet else False
        num_political_entities = check_political_ent(text, 'tweet')
        num_political_entities_in_description = check_political_ent(user_description, 'desc')
        political_word_ratio = political_word_total_word_ratio(text)

        # Append the values of this tweet to their columns.
        row = {
            'tweet_id': id_str,
            'is_retweet': is_retweet,
            'retweeted_username': retweeted_username,
            'text': text,
            'num_mentions': num_mentions,
            'num_hashtags': num_hashtags,
            'num_retweets': retweet_count,
            'num_favorites': favorite_count,
            'user_id': user_id_str,
            'user_screen_name': screen_name,
            'user_description': user_description,
            'total_interactions': total_num_interactions,
            'num_political_entities_tweet': num_political_entities,
            'num_political_entities_description': num_political_entities_in_description,
            'retweeted_political': retweeted_political,
            'political_word_ratio': political_word_ratio,
        }
        for column, value in row.items():
            dfPolitical[column].append(value)

In [None]:
# Turn the accumulated column lists into a DataFrame and display it.
# The name dfPolitical is rebound from the dict of lists to the DataFrame.
dfPolitical = pd.DataFrame(dfPolitical)
dfPolitical

Unnamed: 0,tweet_id,is_retweet,retweeted_username,text,num_mentions,num_hashtags,num_retweets,num_favorites,user_id,user_screen_name,user_description,num_political_entities_tweet,num_political_entities_description,retweeted_political,political_word_ratio,total_interactions
0,1588568792984346624,0,,"Sosyal Hizmetin temelini çocuk oluşturur,çocuğ...",0,0,49,98,920963718103650304,maviruh_,shu/\nburaya afilli bir söz yazdığımı varsayın,0,0,False,0.000000,147
1,1588452263047069697,0,,"@mahirunal Gavur İzmir ya onlar, hani Cumhuriy...",1,0,0,0,595514060,mtfdan,,0,0,False,0.000000,0
2,1569589330544398336,0,,#ŞehitAdayıUzmÇvşaKadro\nSiz İstesenizde Istem...,0,1,0,0,1356375754561490947,ahsucilginuzman,Vatan Sevdalisi,0,0,False,0.000000,0
3,1570428119609139201,0,,@ajans_muhbir Siz kaypak olmayıp onay vermesey...,1,0,0,0,1478775431008595968,hamitelkelle,HighOne,0,0,False,0.000000,0
4,1551163840368414722,0,,Engelli öğretmenler olarak önümüzdeki engeller...,0,0,0,0,1511976696337113088,sed58417690,,0,0,False,0.000000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33528,1568595408233832448,0,,Gerçek kimlik taşımayan hesaplara cevap vermem...,0,0,9,81,576247173,ardanzenturk,RT ONAYLADIĞIM ANLAMINA GELMEZ\nArtık fikirler...,0,0,False,0.000000,90
33529,1584027427696959488,0,,@umitozdag Neden Suriyelilerle ilgili bu kadar...,1,0,1,8,162308585,ozgul_61,Bridge design engineer Yaay hesabı : dilfiruz,0,0,False,0.000000,9
33530,1585945783307730945,0,,@celebimehmeta Niye Türkiye yüzyılıda.Türkiye ...,1,0,0,1,415025519,ladrekova,,0,0,False,0.000000,1
33531,1569748909521801221,1,muazzezeralp,RT @muazzezeralp: @Doan58213655 @denizkonur @N...,7,1,6,0,1442125177727307781,yapikytgrivrlsn,,2,0,False,0.181818,6


## 1.2. Bot Detection

### 1.2.1. Get user metadata from user_profiles.jsons.gz

#### 1.2.1.1. Get user info metadata

In [None]:
def get_user_info_metadata(user_metadata_line):
    """Pick the profile fields used as raw bot-detection data.

    Args:
        user_metadata_line: parsed user profile (a dict).

    Returns:
        dict with the keys user_id, user_name, user_screen_name (lower-cased),
        user_location, user_description, user_followers_count,
        user_friends_count, user_statuses_count, user_is_verified and
        user_has_default_photo, in that order.
    """
    profile = user_metadata_line

    fields = {
        'user_id': profile['id_str'],
        'user_name': profile['name'],
        'user_screen_name': profile['screen_name'].lower(),
        'user_location': profile['location'],
        'user_description': profile['description'],
        'user_followers_count': profile['followers_count'],
        'user_friends_count': profile['friends_count'],
        'user_is_verified': profile['verified'],
        'user_has_default_photo': profile['default_profile_image'],
        'user_statuses_count': profile['statuses_count'],
    }

    # Key order of the returned dictionary.
    output_order = (
        'user_id', 'user_name', 'user_screen_name', 'user_location',
        'user_description', 'user_followers_count', 'user_friends_count',
        'user_statuses_count', 'user_is_verified', 'user_has_default_photo',
    )

    return {key: fields[key] for key in output_order}

#### 1.2.1.2. Get followers/(followers+friends) ratio

In [None]:
def get_followers_all_ratio(user_followers_count, user_friends_count):
    """Return followers / (friends + followers), or 0 when both counts are 0."""
    total_connections = user_friends_count + user_followers_count

    # Guard against dividing by zero for accounts with no connections at all.
    if total_connections == 0:
        return 0

    return user_followers_count / total_connections

#### 1.2.1.3. Get description length

In [None]:
def get_desc_len(user_description):
    """Return the length of a profile description.

    Args:
        user_description: description text, or ``None`` when the profile has
            no description.

    Returns:
        int: number of characters; 0 for ``None`` (instead of the TypeError
        that ``len(None)`` would raise) and for the empty string.
    """
    if user_description is None:
        return 0

    return len(user_description)

In [None]:
# Column-oriented accumulator: one list per feature, filled in lock-step
# (one entry per user profile) by the loop below.
dfBot = {column: [] for column in (
    'user_id',
    'user_name',
    'user_screen_name',
    'user_location',
    'user_description',
    'user_followers_count',
    'user_friends_count',
    'user_statuses_count',
    'description_len',
    'followers_to_all_ratio',
    'user_is_verified',
    'user_has_default_photo',
)}

# The file holds one JSON document (one user profile) per line.
with gzip.open(f"{PATH_TO_DOWNLOADED}user_profiles.jsons.gz", "rb") as f:
    for line in map(json.loads, f):

        # raw data: every key returned by the helper is also a dfBot column
        dictionary = get_user_info_metadata(line)
        for k, v in dictionary.items():
            dfBot[k].append(v)

        # manually crafted data:
        description_len = get_desc_len(dictionary['user_description'])
        dfBot['description_len'].append(description_len)

        followers_all_ratio = get_followers_all_ratio(
            dictionary['user_followers_count'],
            dictionary['user_friends_count'],
        )
        dfBot['followers_to_all_ratio'].append(followers_all_ratio)

In [None]:
# Turn the accumulated column lists into a DataFrame and display it.
# The name dfBot is rebound from the dict of lists to the DataFrame.
dfBot = pd.DataFrame(dfBot)
dfBot

Unnamed: 0,user_id,user_name,user_screen_name,user_location,user_description,user_followers_count,user_friends_count,user_statuses_count,description_len,followers_to_all_ratio,user_is_verified,user_has_default_photo
0,1431241870848450577,Nasreena Khan Wazir,nasreenakhan006,"Islamabad, Pakistan",Student,65,185,2551,7,0.260000,False,False
1,1304340303080386560,fania :((((,scorpiehoez,bogor,have a holly jolly🎄,8235,3011,42771,19,0.732260,False,False
2,1116042038577958914,Yusuf Aksoy,yusufak63712920,,"Bir şeyden pişmanlık duymak istemiyorsan,her ş...",95,399,14300,64,0.192308,False,False
3,4859899931,Be (VIXX6) ama oppalarının düğününe gidemiyor,nedenburdaysam,Hufflepuff ortak salon,"SMStan\n/St☆rlight ///come on girls,this is ou...",40,83,21303,65,0.325203,False,False
4,2225373636,SLMDMR,biologselim,,BİYOLOG🔬🦠\nNanoteknoloji,100,98,1629,23,0.505051,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...
29665,1320834618220781569,islammm,islam_mrsj,,,229,217,2396,0,0.513453,False,False
29666,111074128,Melda Onur,meldaonur,,"Şekersiz çay, etsiz sofra, SAVAŞSIZ dünya... 🐌...",212457,5428,75178,100,0.975088,False,False
29667,36946875,ali ydm,ali_ydm,"İstanbul, Türkiye",hayat oyunsa bende jeton çok,116,141,6482,28,0.451362,False,False
29668,2389587396,Türkan Usta,turkanusta,"Ankara, Türkiye",Ustaya sormuşlar; hayatta yaptığın en büyük is...,1669,3639,121113,116,0.314431,False,False


### 1.2.2. Get Tweet Info of Users in user_profiles.jsons.gz

#### 1.2.2.1. Check ratio of retweets to all tweets

In [None]:
def get_retweet_tweet_ratio(line):
    """Fraction of a user's collected tweets that are retweets.

    A tweet counts as a retweet when it has a 'retweeted_status' key.

    Args:
        line: parsed record with a 'tweets' list of tweet dicts.

    Returns:
        float in [0, 1], or ``None`` when the user has no tweets.
    """
    tweets = line['tweets']
    total_tweets = len(tweets)

    if total_tweets == 0:
        return None

    # A plain key test replaces the former bare `except:`, which classified a
    # tweet as "original" on *any* error and so hid malformed entries.
    number_retweets = sum(1 for tweet in tweets if 'retweeted_status' in tweet)

    return number_retweets / total_tweets

#### 1.2.2.2. Check median number of favorites

In [None]:
def get_median_number_favorites(line):
    """Median 'favorite_count' over a user's collected tweets.

    Args:
        line: parsed record with a 'tweets' list of tweet dicts.

    Returns:
        The median as computed by ``np.median``, or ``np.nan`` when the user
        has no tweets.
    """
    favorite_counts = [tweet['favorite_count'] for tweet in line['tweets']]

    # np.median([]) also yields nan, but only after emitting RuntimeWarnings
    # ("Mean of empty slice" / invalid value); return the same value quietly.
    if not favorite_counts:
        return np.nan

    return np.median(favorite_counts)

### 1.2.3. Collect data using the functions above and transform into a Pandas DataFrame

In [None]:
# Column-oriented accumulator: one list per feature, one entry per user.
dfBotTweets = {column: [] for column in (
    'user_id',
    'retweet_total_ratio',
    'num_median_favorites',
    'num_of_tweets',
)}

# Number of users processed so far (stays 0 if the file is empty).
i = 0

# The file holds one JSON document per line: a user_id plus that user's tweets.
with gzip.open(f"{PATH_TO_DOWNLOADED}user_tweets.jsons.gz", "rb") as f:
    for i, line in enumerate(map(json.loads, f), start=1):

        user_id = line['user_id']
        dfBotTweets['user_id'].append(user_id)

        retweet_total_ratio = get_retweet_tweet_ratio(line)
        dfBotTweets['retweet_total_ratio'].append(retweet_total_ratio)

        num_median_favorites = get_median_number_favorites(line)
        dfBotTweets['num_median_favorites'].append(num_median_favorites)

        dfBotTweets['num_of_tweets'].append(len(line['tweets']))

        # Progress report every 1000 users.
        if i % 1000 == 0:
            print(i)

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000


In [None]:
# Turn the accumulated column lists into a DataFrame and display it.
# The name dfBotTweets is rebound from the dict of lists to the DataFrame.
dfBotTweets = pd.DataFrame(dfBotTweets)
dfBotTweets

Unnamed: 0,user_id,retweet_total_ratio,num_median_favorites,num_of_tweets
0,594642154,0.115000,2.0,200
1,525600289,0.005025,1.0,199
2,931895965501534209,0.900000,0.0,200
3,1591543462746329088,0.185000,0.0,200
4,734801354749796352,1.000000,0.0,200
...,...,...,...,...
28310,1591370361488252928,0.800000,0.0,200
28311,1475272459616235525,0.825000,0.0,200
28312,1096753792731750401,0.051020,1.0,196
28313,1269527617687953409,0.095000,2.0,200


### 1.2.4. Merge dfBot and dfBotTweets

In [None]:
# Left join on user_id (the only column the two frames share) so that every
# profile is kept even when no tweets were collected for it. For such users
# the two tweet statistics used as features are set to 0; num_of_tweets is
# left as NaN.
dfBotAll = (
    dfBot
    .merge(dfBotTweets, how='left', on='user_id')
    .fillna({'retweet_total_ratio': 0, 'num_median_favorites': 0})
)

dfBotAll

Unnamed: 0,user_id,user_name,user_screen_name,user_location,user_description,user_followers_count,user_friends_count,user_statuses_count,description_len,followers_to_all_ratio,user_is_verified,user_has_default_photo,retweet_total_ratio,num_median_favorites,num_of_tweets
0,1431241870848450577,Nasreena Khan Wazir,nasreenakhan006,"Islamabad, Pakistan",Student,65,185,2551,7,0.260000,False,False,0.395939,0.0,197.0
1,1304340303080386560,fania :((((,scorpiehoez,bogor,have a holly jolly🎄,8235,3011,42771,19,0.732260,False,False,0.125000,0.0,200.0
2,1116042038577958914,Yusuf Aksoy,yusufak63712920,,"Bir şeyden pişmanlık duymak istemiyorsan,her ş...",95,399,14300,64,0.192308,False,False,0.910000,0.0,200.0
3,4859899931,Be (VIXX6) ama oppalarının düğününe gidemiyor,nedenburdaysam,Hufflepuff ortak salon,"SMStan\n/St☆rlight ///come on girls,this is ou...",40,83,21303,65,0.325203,False,False,0.015306,1.0,196.0
4,2225373636,SLMDMR,biologselim,,BİYOLOG🔬🦠\nNanoteknoloji,100,98,1629,23,0.505051,False,False,0.659898,0.0,197.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29665,1320834618220781569,islammm,islam_mrsj,,,229,217,2396,0,0.513453,False,False,0.015000,1.0,200.0
29666,111074128,Melda Onur,meldaonur,,"Şekersiz çay, etsiz sofra, SAVAŞSIZ dünya... 🐌...",212457,5428,75178,100,0.975088,False,False,0.291457,2.0,199.0
29667,36946875,ali ydm,ali_ydm,"İstanbul, Türkiye",hayat oyunsa bende jeton çok,116,141,6482,28,0.451362,False,False,0.061538,0.0,195.0
29668,2389587396,Türkan Usta,turkanusta,"Ankara, Türkiye",Ustaya sormuşlar; hayatta yaptığın en büyük is...,1669,3639,121113,116,0.314431,False,False,0.995000,0.0,200.0


# 2. TRAIN MODEL

## 2.1. Political Tweet Prediction

### 2.1.1. Merge dfPolitical data with labels

In [None]:
# Inner join (the pandas default): keep only tweets that have both extracted
# features and a row in the label table.
dfPoliticalAll_train = pd.merge(dfPolitical, trainingTweetDf, on='tweet_id')

dfPoliticalAll_train.head()

Unnamed: 0,tweet_id,is_retweet,retweeted_username,text,num_mentions,num_hashtags,num_retweets,num_favorites,user_id,user_screen_name,user_description,num_political_entities_tweet,num_political_entities_description,retweeted_political,political_word_ratio,total_interactions,isPolitical,topics
0,1588568792984346624,0,,"Sosyal Hizmetin temelini çocuk oluşturur,çocuğ...",0,0,49,98,920963718103650304,maviruh_,shu/\nburaya afilli bir söz yazdığımı varsayın,0,0,False,0.0,147,Yes,"['Unemployment', 'Education']"
1,1588452263047069697,0,,"@mahirunal Gavur İzmir ya onlar, hani Cumhuriy...",1,0,0,0,595514060,mtfdan,,0,0,False,0.0,0,Yes,['Democracy']
2,1569589330544398336,0,,#ŞehitAdayıUzmÇvşaKadro\nSiz İstesenizde Istem...,0,1,0,0,1356375754561490947,ahsucilginuzman,Vatan Sevdalisi,0,0,False,0.0,0,Yes,"['Terrorism', 'Nationalsecurity']"
3,1570428119609139201,0,,@ajans_muhbir Siz kaypak olmayıp onay vermesey...,1,0,0,0,1478775431008595968,hamitelkelle,HighOne,0,0,False,0.0,0,Yes,"['Crime&justice', 'Nationalsecurity']"
4,1551163840368414722,0,,Engelli öğretmenler olarak önümüzdeki engeller...,0,0,0,0,1511976696337113088,sed58417690,,0,0,False,0.0,0,Yes,"['Unemployment', 'Minorities&discrimination']"


### 2.1.2. Separate X and y values
We use only 7 features here to create a baseline model. However, this is not enough to get good results.

In [None]:
# Train only on rows that carry an explicit Yes/No label. Mapping "everything
# that is not 'Yes'" to 0 would silently turn rows with a missing label into
# negative ("not political") training examples.
labeled_tweets = dfPoliticalAll_train[dfPoliticalAll_train['isPolitical'].isin(['Yes', 'No'])]

X = labeled_tweets[['num_political_entities_tweet', 'num_political_entities_description', 'retweeted_political', 'num_mentions', 'total_interactions', 'num_hashtags', 'political_word_ratio']]
# Binary target: 1 for 'Yes', 0 for 'No'.
y = (labeled_tweets['isPolitical'] == 'Yes').astype(int)

### 2.1.3. Train - validation split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labeled tweets for validation; the fixed random_state
# keeps the shuffle (and therefore the split) the same on every run.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.20, random_state=42)

### 2.1.4. Train the model

Here, you may use different models such as neural networks, XGBoost, AdaBoost, RandomForest, Linear Regression, Logistic Regression etc. to see which model performs best. You can also use scikit-learn's `GridSearchCV` or a basic for loop to optimize the hyperparameters of your model.

In [None]:
# Hyperparameter optimization for the isPolitical model
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, mean_squared_error, make_scorer
import xgboost as xgb

# Scorer based on mean squared error. greater_is_better=False marks it as a
# loss, so the search prefers the candidate with the lowest MSE.
mse = make_scorer(mean_squared_error, greater_is_better=False)
# Search grid: 4 * 3 * 4 * 4 * 5 = 960 parameter combinations.
params = {
    'max_depth': range(3, 10, 2),
    'min_child_weight':range(1,6,2),
    'subsample':[i/10.0 for i in range(6,10)],
    'colsample_bytree':[i/10.0 for i in range(6,10)],
    'gamma':[i/10.0 for i in range(0,5)]
}

# Create the base estimator.
# NOTE(review): a regressor with the 'binary:logistic' objective presumably
# outputs probabilities in [0, 1] rather than class labels -- confirm.
xgb_reg = xgb.XGBRegressor(
    objective='binary:logistic'
)

# Exhaustive grid search with 5-fold cross-validation, run on 4 parallel jobs.
grid_search_political = GridSearchCV(estimator = xgb_reg, param_grid= params, n_jobs = 4, cv = 5, verbose = True, scoring=mse)

# Fit every candidate on the training split (960 candidates x 5 folds).
grid_search_political.fit(X_train, y_train)

# Display the best configuration found.
grid_search_political.best_estimator_

Fitting 5 folds for each of 960 candidates, totalling 4800 fits


XGBRegressor(colsample_bytree=0.6, gamma=0.3, min_child_weight=5,
             objective='binary:logistic', subsample=0.7)

In [None]:
# Predict on the held-out validation split with the fitted grid search.
preds = grid_search_political.predict(X_valid)

# Evaluate on the validation set: mean squared error between the 0/1 labels
# and the model outputs.
mse_political = mean_squared_error(y_valid, preds)

print("MSE:", mse_political)

MSE: 0.1878227082008767


## 2.2. Bot Detection

### 2.2.1. Merge dfBotAll data with labels

In [None]:
# Lower-case the screen names so they can be matched against the label table.
dfBotAll['user_screen_name'] = dfBotAll['user_screen_name'].str.lower()

In [None]:
# Inner join (the pandas default): keep only users that have both extracted
# features and a row in the label table. The key column is named differently
# in the two frames.
dfBotAll_train = pd.merge(
    dfBotAll,
    trainingUserDf,
    left_on='user_screen_name',
    right_on='screen_name',
)

dfBotAll_train

Unnamed: 0,user_id,user_name,user_screen_name,user_location,user_description,user_followers_count,user_friends_count,user_statuses_count,description_len,followers_to_all_ratio,user_is_verified,user_has_default_photo,retweet_total_ratio,num_median_favorites,num_of_tweets,screen_name,isBot,gender
0,1431241870848450577,Nasreena Khan Wazir,nasreenakhan006,"Islamabad, Pakistan",Student,65,185,2551,7,0.260000,False,False,0.395939,0.0,197.0,nasreenakhan006,No,Male
1,1304340303080386560,fania :((((,scorpiehoez,bogor,have a holly jolly🎄,8235,3011,42771,19,0.732260,False,False,0.125000,0.0,200.0,scorpiehoez,No,Female
2,1116042038577958914,Yusuf Aksoy,yusufak63712920,,"Bir şeyden pişmanlık duymak istemiyorsan,her ş...",95,399,14300,64,0.192308,False,False,0.910000,0.0,200.0,yusufak63712920,Yes,Male
3,4859899931,Be (VIXX6) ama oppalarının düğününe gidemiyor,nedenburdaysam,Hufflepuff ortak salon,"SMStan\n/St☆rlight ///come on girls,this is ou...",40,83,21303,65,0.325203,False,False,0.015306,1.0,196.0,nedenburdaysam,Yes,Female
4,2225373636,SLMDMR,biologselim,,BİYOLOG🔬🦠\nNanoteknoloji,100,98,1629,23,0.505051,False,False,0.659898,0.0,197.0,biologselim,No,Male
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3195,1286770207134973954,Hamide Arabacı,anka6054,,,151,61,5288,0,0.712264,False,False,0.000000,1.0,200.0,anka6054,No,
3196,1598032338323214338,atamabekleyenbahceci,atamabekleyenzz,,,173,367,311,0,0.320370,False,False,0.580000,0.0,200.0,atamabekleyenzz,No,
3197,760235343966863360,Emrah İNCİ,memrahinci,Istanbul - Bayburt,Researcher | Middle East | Political Science |...,5863,5905,1029,71,0.498215,False,False,0.040000,36.0,200.0,memrahinci,No,
3198,1553973684100124672,Murat Kkk,muratkkk18,,Normal sıradan bir insanım,1,10,18,26,0.090909,False,False,0.769231,0.0,13.0,muratkkk18,No,


In [None]:
# Class balance of the user labels (rows without a label are not counted).
trainingUserDf['isBot'].value_counts()

No          2523
Yes          672
Not sure       5
Name: isBot, dtype: int64

### 2.2.2. Separate X and y values
We use only 4 features here to create a baseline model. However, it is not enough to get good results.

In [None]:
# Train only on rows that carry an explicit Yes/No label. Mapping "everything
# that is not 'Yes'" to 0 would silently turn 'Not sure' (and missing) labels
# into negative ("not a bot") training examples.
labeled_users = dfBotAll_train[dfBotAll_train['isBot'].isin(['Yes', 'No'])]

X = labeled_users[['description_len', 'followers_to_all_ratio', 'retweet_total_ratio', 'num_median_favorites']]
# Binary target: 1 for 'Yes', 0 for 'No'.
y = (labeled_users['isBot'] == 'Yes').astype(int)

### 2.2.3. Train - validation split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; random_state pins the shuffle so the
# split is reproducible across runs.
# The bot labels are imbalanced (far fewer positives than negatives), so the split
# is stratified on y to keep the same class ratio in both partitions.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
    stratify=y,
)

### 2.2.4. Train the model

In [None]:
# Hyper parameter optimization for isBot

# GridSearchCV maximises its scoring function, so the MSE is wrapped with
# greater_is_better=False, which makes the scorer report the negated error.
mse = make_scorer(mean_squared_error, greater_is_better=False)

# Search space: 4 x 3 x 4 x 4 x 5 = 960 candidate combinations.
params = {
    'max_depth': [3, 5, 7, 9],
    'min_child_weight': [1, 3, 5],
    'subsample': [0.6, 0.7, 0.8, 0.9],
    'colsample_bytree': [0.6, 0.7, 0.8, 0.9],
    'gamma': [0.0, 0.1, 0.2, 0.3, 0.4],
}

# Base estimator: an XGBoost regressor trained with the logistic objective.
xgb_reg = xgb.XGBRegressor(objective='binary:logistic')

# Exhaustive 5-fold cross-validated search over the grid, using 4 parallel jobs.
grid_search_bot = GridSearchCV(
    estimator=xgb_reg,
    param_grid=params,
    scoring=mse,
    cv=5,
    n_jobs=4,
    verbose=True,
)

# Run the search on the training partition.
grid_search_bot.fit(X_train, y_train)

# Display the estimator refit with the best parameter combination.
grid_search_bot.best_estimator_

Fitting 5 folds for each of 960 candidates, totalling 4800 fits


XGBRegressor(colsample_bytree=0.6, gamma=0.4, min_child_weight=5,
             objective='binary:logistic', subsample=0.6)

In [None]:
# Score the tuned bot model on the held-out validation rows.
preds = grid_search_bot.predict(X_valid)

# Mean squared error between the 0/1 validation labels and the predictions.
mse_bot = mean_squared_error(y_true=y_valid, y_pred=preds)

print("MSE:", mse_bot)

MSE: 0.1611803155558201


# 3. MAKE PREDICTIONS

Here, you will make predictions with the models that you have trained above.

## 3.1. Predictions for Tweets (Political or Not)

In [None]:
# Load the round-1 evaluation tweet ids: a single header-less column that is read
# as strings and named 'tweet_id'; rows with a missing id are dropped.
evaluationTweetDf = pd.read_csv(
    '{}evaluation-round1-tweet.csv'.format(DATA_PATH),
    header=None,
    names=['tweet_id'],
    dtype={0: str},
).dropna()

# Inner-join the political feature frame with the evaluation ids so that only the
# tweets requested for this round (and present in dfPolitical) remain.
dfPolitical_test = dfPolitical.merge(evaluationTweetDf, how='inner', on='tweet_id')

# Same feature columns as the ones used to train the political model.
X = dfPolitical_test[[
    'num_political_entities_tweet',
    'num_political_entities_description',
    'retweeted_political',
    'num_mentions',
    'total_interactions',
    'num_hashtags',
    'political_word_ratio',
]]

# Predict with the tuned political model.
predictions_political = grid_search_political.predict(X)

### This part is important! We expect you to return your predictions in the following format:

In [None]:
# Map each evaluated tweet id to its predicted score as a plain Python float
# (plain floats are JSON-serialisable, unlike numpy scalars).
modelPredTweet = {
    tweet_id: float(score)
    for tweet_id, score in zip(dfPolitical_test.tweet_id, predictions_political)
}
modelPredTweet

{'1593649159009099777': 0.8801901340484619,
 '1367571642604544000': 0.6399672627449036,
 '1589993032975544320': 0.9224320650100708,
 '1565312596135354373': 0.8678908944129944,
 '1388235183653011462': 0.681520938873291,
 '1592120408073203712': 0.27374565601348877,
 '1439547067337256967': 0.6425844430923462,
 '1597274845381029888': 0.9381685853004456,
 '1586021183958704128': 0.681520938873291,
 '1356926480605982728': 0.49946075677871704,
 '1595357036925026306': 0.9224320650100708,
 '1585766233491886081': 0.681520938873291,
 '1595871258985615361': 0.8821505904197693,
 '1352635736537882629': 0.6954212188720703,
 '1583477966373543936': 0.933241069316864,
 '1564926450096013313': 0.4066222012042999,
 '1585634359612420101': 0.9243434071540833,
 '1384499047390658560': 0.25023216009140015,
 '1596583748669419521': 0.6654172539710999,
 '1391681495622995971': 0.25023216009140015,
 '1365710259549966339': 0.3976902663707733,
 '1590673118397624323': 0.9407147169113159,
 '1389951943343316995': 0.495026

## 3.2. Predictions for Users (Bot or Not)

In [None]:
# Load the round-1 evaluation screen names: a single header-less column that is
# read as strings and named 'user_screen_name'; rows with a missing name are dropped.
evaluationUserDf = pd.read_csv(
    '{}evaluation-round1-user.csv'.format(DATA_PATH),
    header=None,
    names=['user_screen_name'],
    dtype={0: str},
).dropna()

# Inner-join the bot feature frame with the evaluation users so that only the
# users requested for this round (and present in dfBotAll) remain.
dfBot_test = dfBotAll.merge(evaluationUserDf, how='inner', on='user_screen_name')

# Same feature columns as the ones used to train the bot model.
X = dfBot_test[[
    'description_len',
    'followers_to_all_ratio',
    'retweet_total_ratio',
    'num_median_favorites',
]]

# Predict with the tuned bot model.
predictions_bot = grid_search_bot.predict(X)

In [None]:
# Map each evaluated screen name to its predicted score as a plain Python float
# (plain floats are JSON-serialisable, unlike numpy scalars).
modelPredUser = {
    screen_name: float(score)
    for screen_name, score in zip(dfBot_test.user_screen_name, predictions_bot)
}
modelPredUser

{'nedenburdaysam': 0.12919969856739044,
 'biologselim': 0.3814277648925781,
 'alaraaynncnm': 0.15041561424732208,
 '_sydneycarton_': 0.15390081703662872,
 'denizlihabercom': 0.10295960307121277,
 'burakerbaychp': 0.027774780988693237,
 'mustafaarst': 0.048553939908742905,
 'mvnez': 0.19919730722904205,
 'qara118': 0.08183334767818451,
 'alpar_kaan': 0.18369080126285553,
 'farukhalit2': 0.08027489483356476,
 'haf_zhan': 0.2891484498977661,
 'harlunoshi': 0.13824234902858734,
 'heritagepaix': 0.24072960019111633,
 '37baho37': 0.07806546986103058,
 'tamerduran_1': 0.17108385264873505,
 'donkisotumsu': 0.15193375945091248,
 'nuranwolf': 0.08164294064044952,
 'politikgundem': 0.12981922924518585,
 'isakethudax': 0.03294810280203819,
 'ilaydejaneiro': 0.0896638035774231,
 'gendenmukatol': 0.06032172963023186,
 '1905anason': 0.6709826588630676,
 'eraydurgut03': 0.1603109985589981,
 'dasiskein': 0.09793075919151306,
 'mett_1907': 0.09794900566339493,
 'semihyeteer': 0.09061309695243835,
 'habe

# PREPARE SUBMISSION

You will need to submit the exact same file produced by the following code. Any deviation from the desired format will be marked as 0.

In [None]:
# Explain your approach
#
# Free-text explanations that are embedded verbatim in the submission JSON.
# The bot feature list below is kept in sync with the columns actually selected
# for the bot model in this notebook (four features).

data_explanations = '''
To explain how I handle data for training, I can include information about the following:

Data acquisition: First of all, I downloaded the data that I had annotated from Onur teacher's website and installed it,
then added additional data. I read these from CSV files, concatenated them, dropped duplicates, and finally reindexed the data.

Data cleaning: I dropped duplicate values.
Data splitting: for isPolitical, the X training values are: 'num_political_entities_tweet', 'num_political_entities_description', 'retweeted_political', 'num_mentions', 'total_interactions', 'num_hashtags', 'political_word_ratio'
and the y training value is isPolitical.
For bot detection, the X training values are: 'description_len', 'followers_to_all_ratio', 'retweet_total_ratio', 'num_median_favorites'
and the y training value is isBot.
'''

feature_explanations = '''
The code defines a grid search cross-validation (CV) procedure for hyperparameter optimization using the XGBoost library. The XGBoost model is a gradient boosting model for binary classification, as indicated by the objective parameter set to 'binary:logistic'. The model is trained to predict whether a tweet is political or not (isPolitical).

The features used to train the model are:

'num_political_entities_tweet': The number of political entities in the tweet text.
'num_political_entities_description': The number of political entities in the user's description.
'retweeted_political': A binary feature indicating whether the tweet is a political retweet.
'num_mentions': The number of user mentions in the tweet.
'total_interactions': The total number of interactions (likes and retweets) the tweet has received.
'num_hashtags': The number of hashtags used in the tweet.
'political_word_ratio': The ratio of political words to total words in the tweet.
These features are used to predict the target variable 'isPolitical', which is a binary variable indicating whether the tweet is political (1) or not (0). The model is trained using a train-test split of the data, with 80% of the data used for training and 20% used for validation. The grid search procedure searches a defined parameter grid to find the optimal combination of hyperparameters for the model, using the mean squared error (MSE) as the evaluation metric. The best estimator, or the model with the optimal combination of hyperparameters, is then stored in the best_estimator_ attribute of the grid search object.
'''

model_explanations = '''
The code I provided defines a grid search cross-validation (CV) procedure for hyperparameter optimization using the XGBoost library.

The XGBoost model used in this code is a gradient boosting model for binary classification, as indicated by the objective parameter set to 'binary:logistic'. The model is trained to predict whether an example is a bot or not (isBot).
'''

additional_explanations = '''
I did not use any tricks, but next time I will try different values for test_size or random_state.
'''


In [None]:
# Assemble the submission payload: round / student metadata, both prediction
# dictionaries, and the four free-text explanations.
predictions = {
    'round': ROUND,
    'student_id': STUDENT_ID,
    'user_predictions': modelPredUser,
    'tweet_predictions': modelPredTweet,
    'explanations': {
        'data': data_explanations,
        'feature': feature_explanations,
        'model': model_explanations,
        'other': additional_explanations,
    },
}

# Serialise the payload as 4-space-indented JSON into the round-specific file.
with open('predictions-{}_round{}.json'.format(STUDENT_ID, ROUND), 'w') as out_file:
    json.dump(predictions, out_file, indent=4)

In [None]:
# Test your submission file
#
# Round-trip check: reload the JSON that was just written to confirm it parses.
# The file is opened in a `with` block so the handle is closed deterministically
# instead of being left open until garbage collection.
with open('predictions-{}_round{}.json'.format(STUDENT_ID, ROUND), 'r') as fl:
    submission = json.load(fl)

submission

{'round': 1,
 'student_id': '22534',
 'user_predictions': {'nedenburdaysam': 0.12919969856739044,
  'biologselim': 0.3814277648925781,
  'alaraaynncnm': 0.15041561424732208,
  '_sydneycarton_': 0.15390081703662872,
  'denizlihabercom': 0.10295960307121277,
  'burakerbaychp': 0.027774780988693237,
  'mustafaarst': 0.048553939908742905,
  'mvnez': 0.19919730722904205,
  'qara118': 0.08183334767818451,
  'alpar_kaan': 0.18369080126285553,
  'farukhalit2': 0.08027489483356476,
  'haf_zhan': 0.2891484498977661,
  'harlunoshi': 0.13824234902858734,
  'heritagepaix': 0.24072960019111633,
  '37baho37': 0.07806546986103058,
  'tamerduran_1': 0.17108385264873505,
  'donkisotumsu': 0.15193375945091248,
  'nuranwolf': 0.08164294064044952,
  'politikgundem': 0.12981922924518585,
  'isakethudax': 0.03294810280203819,
  'ilaydejaneiro': 0.0896638035774231,
  'gendenmukatol': 0.06032172963023186,
  '1905anason': 0.6709826588630676,
  'eraydurgut03': 0.1603109985589981,
  'dasiskein': 0.097930759191513