## Imports

In [1]:
import time
from decimal import Decimal, InvalidOperation

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver import Chrome
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

## Input dataframe

In [2]:
# Read the list of profile URLs. header=None tells pandas the file has no
# header row, so the columns receive integer labels (0, 1, ...).
input_df = pd.read_csv(
    'twitter_links.csv',
    header=None,
)

In [3]:
# Show the loaded URL table as this cell's output.
input_df

Unnamed: 0,0
0,https://twitter.com/GTNUK1
1,https://twitter.com/whatsapp
2,https://twitter.com/aacb_CBPTrade
3,https://twitter.com/aacbdotcom
4,https://twitter.com/@AAWindowPRODUCT
5,https://www.twitter.com/aandb_kia
6,https://twitter.com/ABHomeInc
7,https://twitter.com/Abrepro
8,http://www.twitter.com
9,https://twitter.com/ACChristofiLtd


## Output dataframe

In [28]:
# Results table: created with no rows and one column per field that the
# extraction loop fills in.
columns = [
    'URL',
    'Bio',
    'Following count',
    'Followers count',
    'Location',
    'Website',
]
output_df = pd.DataFrame(columns=columns)

## Cleaning count

In [5]:
## to clean "K", "M", "," present in the following and followers count

def clean_count(text):
    """Normalise a displayed following/followers count to a plain number string.

    Thousands separators (",") are removed. If the text contains a "K" or "M",
    the final character is treated as the suffix, the rest is parsed as a
    number and scaled by 1e3 ("K") or 1e6 ("M"); the result is returned as the
    string form of a float (e.g. "5.4M" -> "5400000.0"). Text without "K"/"M"
    is returned with only the commas removed (e.g. "1,234" -> "1234").

    Scaling is done in decimal arithmetic rather than with binary floats, so
    "16.1K" yields "16100.0" instead of the rounding artefact
    "16100.000000000002" that float("16.1") * 1e3 produces.

    Args:
        text: Count as shown on the page, e.g. "460", "1,234", "12.5K", "5.4M".

    Returns:
        str: The cleaned count.

    Raises:
        ValueError: If "K"/"M" is present but the characters before the last
            one do not form a valid number.
    """
    if ',' in text:
        text = text.replace(',', '')

    # Nothing to expand: hand back the comma-free text unchanged.
    if 'K' not in text and 'M' not in text:
        return text

    digits, suffix = text[:-1], text[-1]
    multiplier = {'K': Decimal(1000), 'M': Decimal(1000000)}.get(suffix, Decimal(1))
    try:
        # Decimal keeps the multiplication exact; float() then rounds once.
        value = float(Decimal(digits) * multiplier)
    except InvalidOperation:
        # Preserve the exception type that float() raised for malformed input.
        raise ValueError(f'could not convert count to a number: {text!r}') from None
    return str(value)

## Extraction

In [29]:
# Start a Chrome session (default options) that the extraction loop reuses for every URL.
options = webdriver.ChromeOptions()
driver = webdriver.Chrome(options=options)
# Set an implicit wait of 20 seconds: element lookups keep retrying for up to
# that long before giving up.
# NOTE(review): the extraction loop also uses explicit WebDriverWait calls; Selenium's
# documentation advises against mixing implicit and explicit waits because the
# effective wait time becomes hard to predict - confirm this is intended.
driver.implicitly_wait(20)
# Open the site's landing page once before visiting individual profiles.
driver.get('https://twitter.com/')

In [30]:
# Visit each profile URL and copy bio, counts, location and website into output_df.
# Rows whose page cannot be scraped keep their URL and get NaN in every other field.
detail_columns = ['Bio', 'Following count', 'Followers count', 'Location', 'Website']

for i in range(input_df.shape[0]):

    url = input_df.iloc[i, 0]
    output_df.loc[i, 'URL'] = url
    print(url, i)
    driver.get(url)

    try:
        ## Bio
        # Wait up to 10 s for the bio container; until() returns the located
        # element, so no second lookup is needed. A timeout is handled below.
        bio = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located(
                (By.XPATH, '//div[@class="css-175oi2r r-1adg3ll r-6gpygo"]')
            )
        )

        ## Following + Followers count
        count = driver.find_element(By.XPATH, '//div[@class="css-175oi2r r-13awgt0 r-18u37iz r-1w6e6rj"]')
        # NOTE(review): an XPath beginning with '//' is evaluated against the whole
        # document even when called on an element; './/' would restrict it to the
        # element's descendants. The selectors are left as they were because the
        # page structure cannot be verified here.
        numbers = count.find_elements(By.XPATH, '//span[@class="css-1qaijid r-bcqeeo r-qvutc0 r-poiln3 r-1b43r93 r-1cwl3u0 r-b88u0q"]')
        following, followers = clean_count(numbers[0].text), clean_count(numbers[1].text)

        ## Location (optional)
        loc_line = driver.find_element(By.XPATH, '//div[@class="css-1rynq56 r-bcqeeo r-qvutc0 r-37j5jr r-a023e6 r-16dba41 r-56xrmm"]')
        # find_elements returns an empty list instead of raising, so a profile
        # without a location keeps its bio and counts rather than losing the row.
        loc_matches = loc_line.find_elements(By.XPATH, '//span[@class="css-1qaijid r-bcqeeo r-qvutc0 r-poiln3 r-4qtqp9 r-1b7u577"]')
        location = loc_matches[0].text if loc_matches else float('nan')

        ## Website (optional)
        try:
            website = WebDriverWait(loc_line, 5).until(
                EC.presence_of_element_located(
                    (By.XPATH, '//a[@class="css-1qaijid r-bcqeeo r-qvutc0 r-poiln3 r-4qtqp9 r-1b7u577 r-1loqt21"]')
                )
            )
        except TimeoutException:
            # No website link appeared within the wait.
            website = None

        # Read all text before writing, so a failure cannot leave a half-filled row.
        bio_text = bio.text
        website_text = website.text if website is not None else float('nan')

        # Adding elements to output dataframe
        output_df.loc[i, 'Bio'] = bio_text
        output_df.loc[i, 'Following count'] = following
        output_df.loc[i, 'Followers count'] = followers
        output_df.loc[i, 'Location'] = location
        output_df.loc[i, 'Website'] = website_text

    except Exception as exc:
        # Best effort: record the failure and move on. Catching Exception (not a
        # bare except) lets KeyboardInterrupt stop a long run.
        print(f'    no data extracted ({type(exc).__name__})')
        # Label-based assignment stays correct even if output_df has gained
        # extra columns since it was created.
        output_df.loc[i, detail_columns] = float('nan')

https://twitter.com/GTNUK1 0
https://twitter.com/whatsapp 1
https://twitter.com/aacb_CBPTrade 2
https://twitter.com/aacbdotcom 3
https://twitter.com/@AAWindowPRODUCT 4
https://www.twitter.com/aandb_kia 5
https://twitter.com/ABHomeInc 6
https://twitter.com/Abrepro 7
http://www.twitter.com 8
https://twitter.com/ACChristofiLtd 9
https://twitter.com/aeclothing1 10
http://www.twitter.com/ 11
https://twitter.com/AETechnologies1 12
http://www.twitter.com/wix 13
https://twitter.com/AGInsuranceLLC 14


In [31]:
# Show the scraped results as this cell's output.
output_df

Unnamed: 0,URL,Bio,Following count,Followers count,Location,Website
0,https://twitter.com/GTNUK1,Providing Entertainment & Travel to Commercial...,460.0,127.0,"London, England",gtn.uk.com/index.php
1,https://twitter.com/whatsapp,"Simple, reliable, private.",3.0,5400000.0,California,
2,https://twitter.com/aacb_CBPTrade,Customs Broker,125.0,32.0,"Florida, USA",
3,https://twitter.com/aacbdotcom,A & A Freight | Warehousing | Customs Brokerag...,3987.0,665.0,Worldwide,aacb.com
4,https://twitter.com/@AAWindowPRODUCT,A commercial glass and glazing company serving...,89.0,80.0,"Malden, MA",aawindowproducts.com
5,https://www.twitter.com/aandb_kia,"A&B Kia is a Kia dealer in Benwood, WV. Stay c...",339.0,296.0,"Benwood, West Virginia",aandbautosales.com
6,https://twitter.com/ABHomeInc,"Industry leader in wholesale home decor, furni...",181.0,364.0,"Rancho Cucamonga, CA",abhomeinc.com
7,https://twitter.com/Abrepro,From large format black & white prints to the ...,125.0,133.0,"Bentonville, AR",abrepro.com
8,http://www.twitter.com,,,,,
9,https://twitter.com/ACChristofiLtd,A & C CHRISTOFI LTD is a fast growing professi...,292.0,89.0,Cyprus,acccyp.com


In [32]:
# Add a 1-based serial-number column as the first column.
# DataFrame.insert raises ValueError when the column already exists, so the
# guard keeps this cell safe to run more than once.
if 'S.No.' not in output_df.columns:
    output_df.insert(0, 'S.No.', list(range(1, output_df.shape[0] + 1)))

In [33]:
# Write the results to disk; index=False leaves the DataFrame index out of the file.
output_df.to_csv(
    'extracted_twitter.csv',
    index=False,
)