# Data Analysis
This notebook explores the Instagram profile data we extracted using Selenium and PhantomBuster, with the goal of drawing insights from the profile bios.

In [25]:
import spacy
import re
from collections import Counter
import pandas as pd

In [14]:
# Read the scraped profiles (including their bios) into a DataFrame and display it
df = pd.read_csv(filepath_or_buffer="../augmented_phantom_files/result_with_bios.csv")
df

Unnamed: 0,id,username,fullName,bio,profileUrl,isPrivate,isVerified
0,57168642439,youssef_officiel_91,,♥️Solo adoramos a Dios no al hombre💙tiznit cit...,https://www.instagram.com/youssef_officiel_91,False,False
1,1163160760,sophiso_gg,Sofia,Edinburgh📍Madrid📍,https://www.instagram.com/sophiso_gg,True,False
2,5930792775,spyros_vlachos11,Spyros Vlachos,,https://www.instagram.com/spyros_vlachos11,True,False
3,4510988097,ross_patterson_,Ross Patterson,On a mission to find and spread good ideas.,https://www.instagram.com/ross_patterson_,False,False
4,548497561,danielmartinez_ofc,Daniel,Family man & Techno artist\n🎶 @bashtechrecords...,https://www.instagram.com/danielmartinez_ofc,False,False
...,...,...,...,...,...,...,...
426,642860046,faroukelgarem,Farouk ElGarem,,https://www.instagram.com/faroukelgarem,True,False
427,364585004,salmaelgarem,Salma El Garem,,https://www.instagram.com/salmaelgarem,True,False
428,362849049,youssef_ragab,Youssef Ragab,,https://www.instagram.com/youssef_ragab,True,False
429,286153319,angiedeparis,Angeline De Paris,,https://www.instagram.com/angiedeparis,True,False


In [6]:
# Count the rows whose 'bio' value is missing (NaN).
# Note: the printed label calls it "column A", but the column inspected is 'bio'.
nan_count = pd.isna(df['bio']).sum()
print(f"Number of NaN in column A: {nan_count}")

Number of NaN in column A: 154


In [15]:
# Preprocess the bio column: turn every line break into a single space.
# Missing bios (NaN) are left untouched by the replacement.
df['bio'] = df['bio'].replace(to_replace=r'\n', value=' ', regex=True)

In [18]:
# Dump each bio on its own line for a quick visual check; missing bios print as "nan".
for bio_text in df['bio'].tolist():
    print(bio_text)

♥️Solo adoramos a Dios no al hombre💙tiznit citiy🇲🇦♓ 🤲No hay más dios que Tú, yo era de los malhechores🤲
Edinburgh📍Madrid📍
nan
On a mission to find and spread good ideas.
Family man & Techno artist 🎶 @bashtechrecords 🪩 @kenopsicevents 👕 @socalkustomsp
🅱️itcoin🅰️ccount♏anager📈📉 Expert trading 💯💯 💎 Cryptocurrency ⏲ 24/7 trading system 📞 DM in earning profit 👉 100% real and legit and profit guaranteed💯
Records, DJ's, Beer & Analogue Gear ❤️ - All My True Vices.
nan
Edinburgh based collective running live music events in the capital #TheFishBar Next event - Friday 24th March @ the caves
nan
cairo • california || @kesemucirvine
Making gravity feel like pressure since 2012 🤎, A&E nurse, old cat lady, 🇮🇹+🇨🇭 in 🏴󠁧󠁢󠁳󠁣󠁴󠁿. Owner & amateur photographer @roningrappling
🇪🇬🇫🇷🇺🇸  @evogennutrition
Strobe lights and blown speakers Stuttgart, 2004
Dj/Live Techno/Tekno/DnB @eggslutclub /6tmAnon/ @pulse_edinburgh_ www.facebook.com/foxtrottekno www.soundcloud.com/foxtrottekno
nan
https://ra.co/events/1619531

## NLP

In [26]:
def extract_locations_from_text(text):
    """Return the text of every named entity in ``text`` labelled ``GPE``.

    The text is run through the notebook-level ``nlp`` object (a spaCy
    pipeline), which must be loaded before this function is called.

    Args:
        text: The string to analyse.

    Returns:
        A list with the surface text of each entity whose label is "GPE",
        in the order the entities occur in the parsed document.
    """
    gpe_mentions = []
    for entity in nlp(text).ents:
        # Keep only entities tagged GPE (spaCy's label for countries, cities, states).
        if entity.label_ == "GPE":
            gpe_mentions.append(entity.text)
    return gpe_mentions

def extract_locations_from_emojis(text):
    """Map location-related emojis found in ``text`` to location names.

    Args:
        text: The bio text to scan. Non-string values (e.g. ``None`` or the
            float NaN that pandas uses for a missing bio) are treated as
            containing no emojis.

    Returns:
        A list with one location name per mapped emoji present in ``text``.
        Names follow the order of the mapping below (not the order in which
        the emojis appear in the text), and each name appears at most once.
    """
    # A missing bio arrives as float NaN; `emoji in text` would raise TypeError on it.
    if not isinstance(text, str):
        return []

    # A dictionary with some common location-related emojis and their corresponding location names
    emoji_location_mapping = {
        '🗽': 'New York',
        '🏙️': 'City',
        '🏔️': 'Mountains',
        '🌉': 'San Francisco',
        '🇺🇸': 'United States',
        # Add more emojis and locations here
    }

    # U+FE0F (variation selector-16) only requests emoji presentation and is
    # frequently omitted in real text. Strip it from both the text and the keys
    # so that an emoji matches whether or not the selector is present.
    variation_selector = '\ufe0f'
    normalized_text = text.replace(variation_selector, '')

    return [
        location
        for emoji, location in emoji_location_mapping.items()
        if emoji.replace(variation_selector, '') in normalized_text
    ]

In [28]:
# Load spaCy's small English pipeline; extract_locations_from_text reads it
# through the notebook-level name `nlp`.
nlp = spacy.load("en_core_web_sm")

location_counts = Counter()

# Skip missing bios: str(NaN) is the literal text "nan", which would otherwise
# be sent through the NER model as if it were a real bio.
# Only text-based (NER) locations are counted here; emoji-based extraction is
# not applied.
for bio in df['bio'].dropna():
    locations = extract_locations_from_text(str(bio))
    location_counts.update(locations)

print(location_counts)

Counter({'🇪': 17, 'Egypt': 4, 'Edinburgh': 4, 'London': 3, 'UK': 2, 'Berlin': 2, 'Scotland': 2, '🏡': 2, 'Strobe': 1, 'Stuttgart': 1, 'Canonmills': 1, '@ellmontgomery_xo': 1, 'Alexandria': 1, 'Texas': 1, 'Cardiff County FC Player': 1, 'Rutherglen': 1, 'Los Angeles': 1, 'South Africa': 1, 'Parceria': 1, 'the Path Quotes': 1, 'Grasse': 1, 'France': 1, 'Augsburg': 1, 'München': 1, 'egypt': 1, 'dallas': 1, 'Reddit': 1, 'England': 1, 'AsRoma': 1, 'Berlijn': 1, 'Jeddah': 1, '📍Budapest/Edinburgh🏴': 1, 'cairo': 1, 'boston': 1, 'casablanca': 1, 'Objekt': 1, 'doha': 1, 'Paris': 1, '𝓑𝓮𝓬𝓪𝓾𝓼𝓮': 1, 'Brighton': 1, 'Doha': 1})
