# Data Analysis
This notebook analyses the follower data we extracted using Selenium and PhantomBuster. The goal is to find the cities and countries mentioned, either by name or by flag emoji, in the bios of all followers who have a bio.

In [8]:
import pandas as pd
from geotext import GeoText
import emoji
from collections import Counter
import pycountry

In [9]:
def get_flag_emoji_dict(countries=None):
    """Build a lookup from flag emoji to country name.

    A flag emoji is a sequence of two Unicode "regional indicator" symbols,
    one per letter of the country's two-letter code (e.g. "EG" becomes
    U+1F1EA U+1F1EC). The flags are built directly from those code points
    rather than through emoji shortcode names, so the result does not depend
    on which shortcode spellings the installed ``emoji`` package recognises.

    Args:
        countries: Optional iterable of objects exposing ``alpha_2`` (two-letter
            country code) and ``name`` attributes. Defaults to
            ``pycountry.countries``.

    Returns:
        dict: Maps each flag emoji (a two-code-point string) to the
        corresponding country name. Entries whose code is not exactly two
        ASCII letters are skipped.
    """
    if countries is None:
        countries = pycountry.countries

    # U+1F1E6 is REGIONAL INDICATOR SYMBOL LETTER A; the 26 indicators are
    # contiguous, so a fixed offset maps "A".."Z" onto them.
    indicator_offset = 0x1F1E6 - ord("A")

    flag_emoji_dict = {}
    for country in countries:
        country_code = country.alpha_2.upper()
        if len(country_code) != 2 or not all("A" <= letter <= "Z" for letter in country_code):
            continue
        flag_emoji = "".join(chr(ord(letter) + indicator_offset) for letter in country_code)
        flag_emoji_dict[flag_emoji] = country.name
    return flag_emoji_dict

# Build the flag-emoji -> country-name lookup once at notebook level.
flag_emoji_dict = get_flag_emoji_dict()

In [15]:
# Load the dataset using pandas
data = pd.read_csv('../augmented_phantom_files/result_with_bios.csv')

# Keep only the rows that actually have a bio
data_no_nan = data.dropna(subset=['bio'])

# Counters for every country / city mention found in the bios
country_counter = Counter()
city_counter = Counter()

# Flag emoji -> country name lookup
emoji_dict = get_flag_emoji_dict()


def extract_flag_countries(bio, flag_lookup):
    """Return the country names for every flag emoji found in ``bio``.

    A flag emoji is two consecutive regional-indicator code points
    (U+1F1E6..U+1F1FF), so it can never be matched by testing one character
    at a time. The text is scanned left to right and regional indicators are
    consumed in pairs, so adjacent flags stay correctly aligned.

    Args:
        bio: The bio text to scan.
        flag_lookup: dict mapping a two-code-point flag emoji to a country name.

    Returns:
        list: One country name per recognised flag, in order of appearance
        (repeated flags yield repeated names). Pairs that are not keys of
        ``flag_lookup`` are ignored.
    """
    first_indicator, last_indicator = 0x1F1E6, 0x1F1FF
    found = []
    position = 0
    while position < len(bio) - 1:
        is_pair = (
            first_indicator <= ord(bio[position]) <= last_indicator
            and first_indicator <= ord(bio[position + 1]) <= last_indicator
        )
        if is_pair:
            country_name = flag_lookup.get(bio[position:position + 2])
            if country_name is not None:
                found.append(country_name)
            # Consume both halves so the next flag in a run stays aligned.
            position += 2
        else:
            position += 1
    return found


# Iterate over the bios directly; only this one column is needed.
for bio in data_no_nan['bio'].astype(str):
    # Count flags under the country name as-is (not lowercased) so that a flag
    # and a written-out mention can land on the same key.
    # NOTE(review): pycountry and GeoText may spell some country names
    # differently -- confirm whether a normalisation step is needed.
    country_counter.update(extract_flag_countries(bio, emoji_dict))

    # Extract cities and countries mentioned by name using GeoText
    # NOTE(review): the recorded output lists ordinary words such as
    # "University", "March", "Young", "Mon", "Pop" and "Un" as cities, so the
    # city counts are noisy and likely need a stop-list before being reported.
    places = GeoText(bio)
    city_counter.update(places.cities)
    country_counter.update(places.countries)

# Print the results
print("Countries:")
for country, count in country_counter.most_common():
    print(f"{country}: {count}")

print("\nCities:")
for city, count in city_counter.most_common():
    print(f"{city}: {count}")

# len() of a Counter is the number of distinct names, not the number of mentions.
total_city_count = len(city_counter)
total_country_count = len(country_counter)

print("\nTotal cities:", total_city_count)
print("Total countries:", total_country_count)

Countries:
Egypt: 5
South Africa: 1
France: 1

Cities:
Edinburgh: 16
University: 6
London: 6
Glasgow: 3
Alexandria: 3
Paris: 3
Berlin: 2
Madrid: 1
March: 1
Stuttgart: 1
Young: 1
Dublin: 1
Rutherglen: 1
Los Angeles: 1
Cape Town: 1
Paulista: 1
Grasse: 1
Mon: 1
Augsburg: 1
Birmingham: 1
Pop: 1
New York: 1
Un: 1
Nottingham: 1
Manchester: 1
Jeddah: 1
Boston: 1
Budapest: 1
Musselburgh: 1
Leeds: 1
Brighton: 1
Doha: 1
Cairo: 1
Southampton: 1
Bunda: 1
Sheffield: 1
Lyon: 1
Leicester: 1
Newcastle: 1
Washington: 1

Total cities: 40
Total countries: 3
