In [19]:
import spacy
import pandas as pd

tweets_df = pd.read_csv("SriLankaTweets.csv")

# Load the English NLP model
nlp = spacy.load("en_core_web_sm")


# Function to extract place names from a text using NER
def extract_place_names(text):
    doc = nlp(text)
    return [ent.text for ent in doc.ents if ent.label_ == "GPE"]


# Apply the function to tweets
tweets_df['place_names'] = tweets_df['tweet'].apply(extract_place_names)


In [20]:
tweets_df['place_names']

0                                         [Sri Lanka]
1                                         [Sri Lanka]
2                                                  []
3                               [Sri lanka, Red Zone]
4        [Sri Lanka's, Sri Lanka’s, Ukraine, Germany]
                             ...                     
9999                                      [Sri Lanka]
10000                                              []
10001                     [Sri Lanka, skal, SriLanka]
10002                                              []
10003                                              []
Name: place_names, Length: 10004, dtype: object

In [21]:
tweets_df = tweets_df[tweets_df['place_names'].str.len() > 0]


In [22]:
tweets_df

Unnamed: 0.1,Unnamed: 0,id,conversation_id,created_at,date,timezone,place,tweet,language,hashtags,...,source,user_rt_id,user_rt,retweet_id,reply_to,retweet_date,translate,trans_src,trans_dest,place_names
0,0,1546235784730230785,1546089912042127362,1.657486e+12,2022-07-10 20:51:53,0,,@MrJonasDanner Das geht doch seit Beginn der B...,de,[],...,,,,,"[{'screen_name': 'MrJonasDanner', 'name': 'Jon...",,,,,[Sri Lanka]
1,1,1546235779906781186,1546235779906781186,1.657486e+12,2022-07-10 20:51:52,0,,Artículo lúcido y bien informado sobre la cris...,es,[],...,,,,,[],,,,,[Sri Lanka]
3,3,1546235770582847496,1546185673593524225,1.657486e+12,2022-07-10 20:51:50,0,,@Haqeeqat_TV Ab na daro aur sikho in Sri lank...,hi,[],...,,,,,"[{'screen_name': 'Haqeeqat_TV', 'name': 'Haqee...",,,,,"[Sri lanka, Red Zone]"
4,4,1546235754342498308,1546235754342498308,1.657486e+12,2022-07-10 20:51:46,0,,Resigned or Arrested? 1. Sri Lanka's presiden...,et,[],...,,,,,[],,,,,"[Sri Lanka's, Sri Lanka’s, Ukraine, Germany]"
5,5,1546235750446170113,1546235750446170113,1.657486e+12,2022-07-10 20:51:45,0,,Sri Lanka protesters vow to occupy presidentia...,en,[],...,,,,,[],,,,,[Sri Lanka]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9990,9990,1546176384640360449,1546176384640360449,1.657472e+12,2022-07-10 16:55:51,0,,Ce qui se passe au Sri Lanka est incroyable !,fr,[],...,,,,,[],,,,,[Sri Lanka]
9994,9994,1546176347424567297,1546176347424567297,1.657472e+12,2022-07-10 16:55:42,0,,"""The United States, together with many other m...",en,['srilankaeconomiccrisis'],...,,,,,[],,,,,"[The United States, Sri Lanka, US]"
9998,9998,1546176327912493056,1546176327912493056,1.657472e+12,2022-07-10 16:55:37,0,,De Haagse pluce heeft niet door hetgeen er zic...,nl,[],...,,,,,[],,,,,[Sri Lanka]
9999,9999,1546176323789504512,1546176323789504512,1.657472e+12,2022-07-10 16:55:36,0,,El suicidio económico de la agricultura orgáni...,es,[],...,,,,,[],,,,,[Sri Lanka]


In [23]:
# List of columns to keep
columns_to_keep = ['tweet', 'created_at', 'place_names', 'hashtags']
tweets_df = tweets_df[columns_to_keep]

In [24]:
tweets_df

Unnamed: 0,tweet,created_at,place_names,hashtags
0,@MrJonasDanner Das geht doch seit Beginn der B...,1.657486e+12,[Sri Lanka],[]
1,Artículo lúcido y bien informado sobre la cris...,1.657486e+12,[Sri Lanka],[]
3,@Haqeeqat_TV Ab na daro aur sikho in Sri lank...,1.657486e+12,"[Sri lanka, Red Zone]",[]
4,Resigned or Arrested? 1. Sri Lanka's presiden...,1.657486e+12,"[Sri Lanka's, Sri Lanka’s, Ukraine, Germany]",[]
5,Sri Lanka protesters vow to occupy presidentia...,1.657486e+12,[Sri Lanka],[]
...,...,...,...,...
9990,Ce qui se passe au Sri Lanka est incroyable !,1.657472e+12,[Sri Lanka],[]
9994,"""The United States, together with many other m...",1.657472e+12,"[The United States, Sri Lanka, US]",['srilankaeconomiccrisis']
9998,De Haagse pluce heeft niet door hetgeen er zic...,1.657472e+12,[Sri Lanka],[]
9999,El suicidio económico de la agricultura orgáni...,1.657472e+12,[Sri Lanka],[]


In [25]:
# Flatten the list of place names
all_place_names = [place for sublist in tweets_df['place_names'] for place in sublist]

# Remove duplicates and create a new DataFrame
unique_place_names_df = pd.DataFrame({'unique_place_names': list(set(all_place_names))})

# Display the new DataFrame
print(unique_place_names_df)

              unique_place_names
0                        बार बार
1                     Our County
2                          Capaz
3                      Singapore
4                       Honduras
...                          ...
2158                  Nueva York
2159                      Crisis
2160  El presidente de Sri Lanka
2161                    Gobierno
2162               los ganaderos

[2163 rows x 1 columns]


In [26]:
import re

# Function to check if a place name is an @mention
def is_mention(place_name):
    return re.match(r'@\w+', place_name) is not None

# Filter out @mentions
unique_place_names_df = unique_place_names_df[~unique_place_names_df['unique_place_names'].apply(is_mention)]

# Function to check if a place name contains a URL
def contains_url(place_name):
    url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    return re.search(url_pattern, place_name) is not None

# Filter out entries with URLs
unique_place_names_df = unique_place_names_df[~unique_place_names_df['unique_place_names'].apply(contains_url)]

emoji_pattern = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "\U00002702-\U000027B0"
    "\U000024C2-\U0001F251"
    "\U0001f926-\U0001f937"
    "\u200d"
    "\u2640-\u2642"
    "\U0001F1F2-\U0001F1F4"  # Macau flag
    "\U0001F1E6-\U0001F1FF"  # flags
    "\U0001F600-\U0001F64F"
    "\U0001F680-\U0001F6FF"
    "\u2600-\u26FF"          # Miscellaneous Symbols
    "\u2700-\u27BF"          # Dingbats
    "\U0001F1E0-\U0001F1FF"  # Flags
    "]+", flags=re.UNICODE)

# Function to remove emojis from a string
def remove_emojis(text):
    return emoji_pattern.sub(r'', text)

# Apply the function to each place name
unique_place_names_df['unique_place_names'] = unique_place_names_df['unique_place_names'].apply(remove_emojis)

In [27]:
import pandas as pd
import pycountry
from fuzzywuzzy import process

# Create a list of all country names
country_names = [country.name for country in pycountry.countries]

# Function to find the closest country name
def find_closest_country(place_name):
    closest_country, _ = process.extractOne(place_name, country_names)
    return closest_country

# Apply the function to each place name
unique_place_names_df['identified_country'] = unique_place_names_df['unique_place_names'].apply(find_closest_country)

# Display the updated DataFrame
print(unique_place_names_df.head())



  unique_place_names                     identified_country
0            बार बार  Congo, The Democratic Republic of the
1         Our County                                  Congo
2              Capaz                                  Japan
3          Singapore                              Singapore
4           Honduras                               Honduras


In [28]:
world_cities_df = pd.read_csv("worldcities.csv")

# Create a combined city-country column for easier matching
world_cities_df['city_country'] = world_cities_df['city'] + ', ' + world_cities_df['country']
world_cities_df['country_only'] = world_cities_df['country']

# Function to find coordinates for a given place name
def find_coordinates(place_name):
    # Try matching with city-country combination
    match = world_cities_df[world_cities_df['city_country'] == place_name]
    if not match.empty:
        return match.iloc[0]['lat'], match.iloc[0]['lng']
    
    # Try matching with country only
    match = world_cities_df[world_cities_df['country_only'] == place_name]
    if not match.empty:
        return match.iloc[0]['lat'], match.iloc[0]['lng']

    # No match found
    return None, None

# Apply the function to each place name in 'unique_place_names_df'
unique_place_names_df['latitude'], unique_place_names_df['longitude'] = zip(*unique_place_names_df['unique_place_names'].apply(find_coordinates))

# Display the updated DataFrame
print(unique_place_names_df)

              unique_place_names                     identified_country  \
0                        बार बार  Congo, The Democratic Republic of the   
1                     Our County                                  Congo   
2                          Capaz                                  Japan   
3                      Singapore                              Singapore   
4                       Honduras                               Honduras   
...                          ...                                    ...   
2158                  Nueva York                                   Niue   
2159                      Crisis                       Christmas Island   
2160  El presidente de Sri Lanka                              Sri Lanka   
2161                    Gobierno                                  Benin   
2162               los ganaderos                                  Ghana   

      latitude  longitude  
0          NaN        NaN  
1          NaN        NaN  
2          NaN 

In [29]:
# for the countries that do not have latitude and longitude, replace their unique_place_names value with the value of identified country
# Replace unique_place_names with identified_country where coordinates are missing
mask = unique_place_names_df['latitude'].isna() & unique_place_names_df['longitude'].isna()
unique_place_names_df.loc[mask, 'unique_place_names'] = unique_place_names_df.loc[mask, 'identified_country']

In [31]:
unique_place_names_df.to_csv("check.csv",index=False)

In [32]:
world_cities_df = pd.read_csv("worldcities.csv")

# Create a combined city-country column for easier matching
world_cities_df['city_country'] = world_cities_df['city'] + ', ' + world_cities_df['country']
world_cities_df['country_only'] = world_cities_df['country']

# Function to find coordinates for a given place name
def find_coordinates(place_name):
    # Try matching with city-country combination
    match = world_cities_df[world_cities_df['city_country'] == place_name]
    if not match.empty:
        return match.iloc[0]['lat'], match.iloc[0]['lng']
    
    # Try matching with country only
    match = world_cities_df[world_cities_df['country_only'] == place_name]
    if not match.empty:
        return match.iloc[0]['lat'], match.iloc[0]['lng']

    # No match found
    return None, None

# Apply the function to each place name in 'unique_place_names_df'
unique_place_names_df['latitude'], unique_place_names_df['longitude'] = zip(*unique_place_names_df['unique_place_names'].apply(find_coordinates))

# Display the updated DataFrame
print(unique_place_names_df)

                         unique_place_names  \
0     Congo, The Democratic Republic of the   
1                                     Congo   
2                                     Japan   
3                                 Singapore   
4                                  Honduras   
...                                     ...   
2158                                   Niue   
2159                       Christmas Island   
2160                              Sri Lanka   
2161                                  Benin   
2162                                  Ghana   

                         identified_country  latitude  longitude  
0     Congo, The Democratic Republic of the       NaN        NaN  
1                                     Congo       NaN        NaN  
2                                     Japan   35.6897   139.6922  
3                                 Singapore    1.3000   103.8000  
4                                  Honduras   14.1000   -87.2167  
...                              

In [33]:
unique_place_names_df.to_csv("check.csv",index=False)

In [35]:
unique_place_names_df

Unnamed: 0,unique_place_names,identified_country,latitude,longitude
0,"Congo, The Democratic Republic of the","Congo, The Democratic Republic of the",,
1,Congo,Congo,,
2,Japan,Japan,35.6897,139.6922
3,Singapore,Singapore,1.3000,103.8000
4,Honduras,Honduras,14.1000,-87.2167
...,...,...,...,...
2158,Niue,Niue,-19.0560,-169.9210
2159,Christmas Island,Christmas Island,-10.4167,105.7167
2160,Sri Lanka,Sri Lanka,6.9344,79.8428
2161,Benin,Benin,6.3667,2.4333


In [36]:
# Step 1: Create a mapping from place names to coordinates
place_to_coords = {
    row['unique_place_names']: (row['latitude'], row['longitude'])
    for index, row in unique_place_names_df.iterrows()
}

# Step 2: Assign coordinates to each tweet
def assign_coords(place_names):
    # Find the first place name in the list with available coordinates
    for name in place_names:
        if name in place_to_coords and place_to_coords[name] != (None, None):
            return place_to_coords[name]
    return None, None

# Apply the function to the 'place_names' column of tweets_df
coords = tweets_df['place_names'].apply(assign_coords)
tweets_df['latitude'], tweets_df['longitude'] = zip(*coords)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tweets_df['latitude'], tweets_df['longitude'] = zip(*coords)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tweets_df['latitude'], tweets_df['longitude'] = zip(*coords)


In [37]:
tweets_df

Unnamed: 0,tweet,created_at,place_names,hashtags,latitude,longitude
0,@MrJonasDanner Das geht doch seit Beginn der B...,1.657486e+12,[Sri Lanka],[],6.9344,79.8428
1,Artículo lúcido y bien informado sobre la cris...,1.657486e+12,[Sri Lanka],[],6.9344,79.8428
3,@Haqeeqat_TV Ab na daro aur sikho in Sri lank...,1.657486e+12,"[Sri lanka, Red Zone]",[],,
4,Resigned or Arrested? 1. Sri Lanka's presiden...,1.657486e+12,"[Sri Lanka's, Sri Lanka’s, Ukraine, Germany]",[],50.4500,30.5233
5,Sri Lanka protesters vow to occupy presidentia...,1.657486e+12,[Sri Lanka],[],6.9344,79.8428
...,...,...,...,...,...,...
9990,Ce qui se passe au Sri Lanka est incroyable !,1.657472e+12,[Sri Lanka],[],6.9344,79.8428
9994,"""The United States, together with many other m...",1.657472e+12,"[The United States, Sri Lanka, US]",['srilankaeconomiccrisis'],6.9344,79.8428
9998,De Haagse pluce heeft niet door hetgeen er zic...,1.657472e+12,[Sri Lanka],[],6.9344,79.8428
9999,El suicidio económico de la agricultura orgáni...,1.657472e+12,[Sri Lanka],[],6.9344,79.8428
