# Birds of a Feather Flock Together: A Recommender System for Brand-Influencer Matching

**MSDS 2023 Term 3 LT9** | Albao, Delfin, Lazaro, Lucas, Menorca

This notebook serves as a supplement to the main report, detailing the data scraping done for this project.

In this project, the users followed by an account are considered its social network. Moving forward, "following" and "social network" may be used interchangeably.

**Data Source**

All of the datasets used in this project were gathered via the Twitter API. They include the profiles of Twitter users, their tweets, and their social networks.

In [None]:
import requests
import numpy as np
import pandas as pd

from getpass import getpass
from sqlalchemy import create_engine
import sqlite3
import time

# I. Preliminaries

In [None]:
# File name of the SQLite database that stores every scraped table
sqlite_db = 'dmw2_final_project.db'
# SQLAlchemy URL of that database; also passed directly as `con` to pandas
sqlite_conn = f'sqlite:///{sqlite_db}'
# Engine built from the same URL
engine = create_engine(sqlite_conn)

In [None]:
# Prompt (without echoing the input) for the bearer token of your Twitter Dev
# Account; it is sent in the Authorization header of every API request
bearer_token = getpass(prompt='Password: ')

# II. The Brand's Profile: *Dove*

In [None]:
# Input the brand's Twitter username
brand_username = 'Dove_PH'

# Look up the brand's profile by username
try:
    response = requests.get(
        f'https://api.twitter.com/2/users/by/username/{brand_username}',
        headers={'Authorization': f'Bearer {bearer_token}'},
        params={"user.fields": "id,created_at,description,"
                               "location,name,username,url,verified,"
                               "public_metrics"},
    ).json()
except (requests.exceptions.RequestException, ValueError):
    # Network failure or a non-JSON body: report it and stop here instead of
    # continuing with an undefined `response`
    print("Invalid request: check username input or bearer_token")
    raise

# The API answers failed lookups with an 'errors' payload and no 'data'
brand_info = response.get('data')
if brand_info is None:
    raise ValueError(
        f"No profile data returned for {brand_username!r}: {response}")

# The brand's user ID is needed to query its following list
brand_id = brand_info['id']

# Store the information into a pandas dataframe, flattening the nested
# public_metrics dict into one column per metric
df_brand = pd.DataFrame([brand_info])
df_brand = (df_brand.drop(columns='public_metrics')
                    .join(pd.json_normalize(df_brand.public_metrics)))

# # Save the information into the sqlite db
# tbl_brand = 'brands'
# df_brand.to_sql(tbl_brand, sqlite_conn, index=False, if_exists='replace')

# III. Social Network of Dove

*Dove*'s social network was used as a reference for the characteristics that *Dove* looks for in an influencer.

In [None]:
# ID of the brand account, taken from the profile fetched in the brand cell
brand_id = brand_info['id']

# Endpoint for getting the users that a given ID follows
url = f'https://api.twitter.com/2/users/{brand_id}/following'

# Set the parameters
params = {'max_results': 1000,
          'user.fields': 'id,username,created_at,location,description,'
                         'public_metrics,url,name,protected'}
headers = {'Authorization': f'Bearer {bearer_token}'}

following_ids = []
start = time.time()
page = 1

# Loop through all of the pages to get all the following
while True:
    # Report progress on the first page and on every even page
    if page == 1 or page % 2 == 0:
        print(f"At page {page}")

    resp = requests.get(url, headers=headers, params=params)
    # Fail loudly on HTTP errors (e.g. 401, 429) instead of hitting a
    # KeyError on the missing 'data' key further down
    resp.raise_for_status()
    r = resp.json()

    # A page without results carries no 'data' key
    following_ids.extend(r.get('data', []))

    # The API signals another page through meta.next_token
    next_token = r.get('meta', {}).get('next_token')
    if next_token is None:
        break
    params['pagination_token'] = next_token
    page += 1
    time.sleep(1)

end_time = time.time() - start
print(f"Total scrape time is {end_time:.2f} seconds")

In [None]:
# Store the collected data into a pandas dataframe, flattening the nested
# public_metrics dict into one column per metric
df_following = pd.DataFrame(following_ids)
df_following = (df_following.drop(columns='public_metrics')
                            .join(pd.json_normalize(df_following.public_metrics)))

# Defining an influencer as those with at least 50K followers
MIN_INFLUENCER_FOLLOWERS = 50000

# Accounts whose username contains 'Dove' are excluded from the reference set
is_influencer = ((df_following.followers_count >= MIN_INFLUENCER_FOLLOWERS)
                 & ~df_following.username.str.contains('Dove'))
valid_following_ids = df_following.loc[is_influencer, 'id'].unique().tolist()

# Flag the selected accounts with 1 and everything else with 0. Direct column
# assignment replaces the chained `fillna(inplace=True)`, which does not
# reliably write back to the frame; float keeps the previous column dtype.
df_following['included'] = (df_following.id.isin(valid_following_ids)
                                        .astype(float))

# Save the scraped info into the sqlite db
tbl_following = 'following'
df_following.to_sql(tbl_following, con=sqlite_conn, index=False, if_exists='replace')

# III. Influencers

## Reference Influencer 1: *Anne Curtis*

### Profile

In [None]:
# Input the influencers' Twitter username
ref_influencer_username = 'annecurtissmith'

# Look up the influencer's profile by username
try:
    response = requests.get(
        f'https://api.twitter.com/2/users/by/username/{ref_influencer_username}',
        headers={'Authorization': f'Bearer {bearer_token}'},
        params={"user.fields": "id,created_at,description,"
                               "location,name,username,url,verified,"
                               "public_metrics"},
    ).json()
except (requests.exceptions.RequestException, ValueError):
    # Network failure or a non-JSON body: report it and stop here instead of
    # continuing with an undefined (or stale) `response`
    print("Invalid request: check username input or bearer_token")
    raise

# The API answers failed lookups with an 'errors' payload and no 'data'
ref_influencer_info = response.get('data')
if ref_influencer_info is None:
    raise ValueError(
        f"No profile data returned for {ref_influencer_username!r}: {response}")
ref_influencer_id = ref_influencer_info['id']

# Store info into a pandas dataframe, flattening the nested public_metrics
# dict into one column per metric
df_ref_influencers = pd.DataFrame([ref_influencer_info])
df_ref_influencers = (df_ref_influencers.drop(columns='public_metrics')
                                        .join(pd.json_normalize(df_ref_influencers.public_metrics)))

# Save the dataframe into the sqlite db
tbl_ref_inf = 'ref_influencers'
df_ref_influencers.to_sql(tbl_ref_inf, sqlite_conn, index=False, if_exists='append')

### Social Network

In [None]:
# Anne following: page through every account the reference influencer follows
url = f'https://api.twitter.com/2/users/{ref_influencer_id}/following'

params = {'max_results': 1000,
          'user.fields': 'id,username,created_at,location,description,'
                         'public_metrics,url,name,protected'}
headers = {'Authorization': f'Bearer {bearer_token}'}

inf_following_ids = []

start = time.time()
page = 1

while True:
    # Report progress on the first page and on every even page
    if page == 1 or page % 2 == 0:
        print(f"At page {page}")

    resp = requests.get(url, headers=headers, params=params)
    # Fail loudly on HTTP errors (e.g. 401, 429) instead of hitting a
    # KeyError on the missing 'data' key further down
    resp.raise_for_status()
    r = resp.json()

    # A page without results carries no 'data' key
    inf_following_ids.extend(r.get('data', []))
    # Pause between requests
    time.sleep(2)

    # The API signals another page through meta.next_token
    next_token = r.get('meta', {}).get('next_token')
    if next_token is None:
        break
    params['pagination_token'] = next_token
    page += 1

end_time = time.time() - start
print(f"Total scrape time is {end_time:.2f} seconds")

In [None]:
# Store into a pandas dataframe, flattening the nested public_metrics dict
# into one column per metric
df_ref_following = pd.DataFrame(inf_following_ids)
# 'withheld' only appears when at least one followed account carries that
# field, so its absence must not abort the cell
df_ref_following = (df_ref_following
                    .drop(columns=['public_metrics', 'withheld'], errors='ignore')
                    .join(pd.json_normalize(df_ref_following.public_metrics)))

# Save into the sqlite db (replaces any previous contents of the table)
tbl_influencers = 'influencers'
df_ref_following.to_sql(tbl_influencers, engine,
                        index=False, if_exists='replace')

## Reference Influencer 2: *Alden Richards*

### Profile

In [None]:
# Input influencers' Twitter username
ref_influencer_username = 'aldenrichards02'

# Look up the influencer's profile by username
try:
    response = requests.get(
        f'https://api.twitter.com/2/users/by/username/{ref_influencer_username}',
        headers={'Authorization': f'Bearer {bearer_token}'},
        params={"user.fields": "id,created_at,description,"
                               "location,name,username,url,verified,"
                               "public_metrics"},
    ).json()
except (requests.exceptions.RequestException, ValueError):
    # Network failure or a non-JSON body: report it and stop here instead of
    # silently reusing the `response` left over from an earlier cell
    print("Invalid request: check username input or bearer_token")
    raise

# The API answers failed lookups with an 'errors' payload and no 'data'
ref_influencer_info = response.get('data')
if ref_influencer_info is None:
    raise ValueError(
        f"No profile data returned for {ref_influencer_username!r}: {response}")
ref_influencer_id = ref_influencer_info['id']

# Store data into pandas dataframe, flattening the nested public_metrics
# dict into one column per metric
df_ref_influencers = pd.DataFrame([ref_influencer_info])
df_ref_influencers = (df_ref_influencers.drop(columns='public_metrics')
                                        .join(pd.json_normalize(df_ref_influencers.public_metrics)))

# Save into sqlite db
tbl_ref_inf = 'ref_influencers'
df_ref_influencers.to_sql(tbl_ref_inf, sqlite_conn, index=False, if_exists='append')

### Social Network

In [None]:
# Alden following: page through every account the reference influencer follows
url = f'https://api.twitter.com/2/users/{ref_influencer_id}/following'

params = {'max_results': 1000,
          'user.fields': 'id,username,created_at,location,description,'
                         'public_metrics,url,name,protected'}
headers = {'Authorization': f'Bearer {bearer_token}'}

inf_following_ids = []

start = time.time()
page = 1

while True:
    # Report progress on the first page and on every even page
    if page == 1 or page % 2 == 0:
        print(f"At page {page}")

    resp = requests.get(url, headers=headers, params=params)
    # Fail loudly on HTTP errors (e.g. 401, 429) instead of hitting a
    # KeyError on the missing 'data' key further down
    resp.raise_for_status()
    r = resp.json()

    # A page without results carries no 'data' key
    inf_following_ids.extend(r.get('data', []))
    # Pause between requests
    time.sleep(2)

    # The API signals another page through meta.next_token
    next_token = r.get('meta', {}).get('next_token')
    if next_token is None:
        break
    params['pagination_token'] = next_token
    page += 1

end_time = time.time() - start
print(f"Total scrape time is {end_time:.2f} seconds")

In [None]:
# Store info in pandas dataframe, flattening the nested public_metrics dict
# into one column per metric
df_ref_following = pd.DataFrame(inf_following_ids)
# Drop 'withheld' when present, as is done for the first reference
# influencer, so both frames are shaped the same way before being appended
# to the shared table
df_ref_following = (df_ref_following
                    .drop(columns=['public_metrics', 'withheld'], errors='ignore')
                    .join(pd.json_normalize(df_ref_following.public_metrics)))

# Save dataframe in sqlite db (appended after the first influencer's rows)
tbl_influencers = 'influencers'
df_ref_following.to_sql(tbl_influencers, engine,
                        index=False, if_exists='append')

# IV. Influencer Tweets

In [None]:
# Reduce the influencers table to one row per account id (the first
# occurrence is kept), then append the result to the partners table
tbl_influencers = 'influencers'
tbl_partners = 'partners'

df_influencers = (pd.read_sql(f"SELECT * FROM {tbl_influencers}", sqlite_conn)
                    .drop_duplicates(subset='id', keep='first'))
df_influencers.to_sql(name=tbl_partners, con=engine,
                      index=False, if_exists='append')

In [None]:
# Load the partners table to get the unique partner IDs
tbl_partners = 'partners'
df_partners = pd.read_sql(f"SELECT * FROM {tbl_partners}", sqlite_conn)

# Scrape the partners' tweets
headers = {'Authorization': f'Bearer {bearer_token}'}
# The query parameters are the same for every partner, so build them once
params = {'max_results': 100,
          'exclude': 'retweets',
          'place.fields': 'country,country_code,full_name,place_type',
          'tweet.fields': 'id,created_at,text,lang,'
                          'conversation_id,in_reply_to_user_id,author_id,'
                          'public_metrics,possibly_sensitive',
          'user.fields': 'id'}
partner_tweets = []
start = time.time()

for idx, partner_id in enumerate(df_partners.id.unique().tolist()):

    print(f'Partner: {idx}')
    url = f'https://api.twitter.com/2/users/{partner_id}/tweets'

    # Only the first page (up to 100 tweets) is requested per partner
    r = requests.get(url, headers=headers, params=params).json()
    time.sleep(1.5)

    meta = r.get('meta')
    if meta is None:
        # No 'meta' means the request failed. Not every failure payload has
        # an 'errors' list, so read it defensively.
        errors = r.get('errors') or [{}]
        if errors[0].get('title') == 'Authorization Error':
            print(f'Authorization Error: {r}')
        else:
            print(f'Error encountered: {r}')
        continue

    if meta.get('result_count', 0) > 0:
        partner_tweets.extend(r.get('data', []))
        if 'next_token' in meta:
            print('Next page exists')
        else:
            print('No next token')
    else:
        print(f'Result = 0: {r}')

end_time = time.time() - start
print(f"Total scrape time is {end_time:.2f} seconds")

In [None]:
# Store info into pandas dataframe, flattening the nested public_metrics
# dict into one column per metric
df_partner_tweets = pd.DataFrame(partner_tweets)
# 'withheld' and 'edit_history_tweet_ids' are not guaranteed to be present in
# the scraped tweets, so a missing column must not abort the cell
df_partner_tweets = (df_partner_tweets
                     .drop(columns=['public_metrics', 'withheld',
                                    'edit_history_tweet_ids'],
                           errors='ignore')
                     .join(pd.json_normalize(df_partner_tweets.public_metrics)))

# Save dataframe into sqlite db
tbl_partner_tweets = 'partner_tweets'
df_partner_tweets.to_sql(tbl_partner_tweets, sqlite_conn,
                         index=False, if_exists='replace')