# Capstone Project

### Data Cleaning

In [1]:
#Import libraries
import pandas as pd
import numpy as np
import re
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from transformers import pipeline
import country_converter as coco
import geonamescache
import us
import logging
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# Single GeonamesCache instance; get_cities_by_country and check_for_others read from it
gc = geonamescache.GeonamesCache()

**Read in both #workfromhome and #remotework**

In [2]:
# Read the six CSV exports of #workfromhome / #remotework tweets, in the same order as before
(
    wfh_df,
    rw_df,
    wfh_df_mixed,
    rw_df_mixed,
    wfh_df_25,
    rw_df_25,
) = map(
    pd.read_csv,
    (
        "../dataset/tweet_df.csv",
        "../dataset/remote_work.csv",
        "../dataset/wfh_mixed.csv",
        "../dataset/remote_mixed.csv",
        "../dataset/work_from_home_25.csv",
        "../dataset/remote_work_25.csv",
    ),
)

In [3]:
# Stack all six frames row-wise into one frame (the original row labels are kept)
twitter_df = pd.concat(
    [
        rw_df,
        wfh_df,
        wfh_df_mixed,
        rw_df_mixed,
        wfh_df_25,
        rw_df_25,
    ]
)

In [4]:
# (rows, columns) of the combined frame, before duplicates are removed
twitter_df.shape

(39534, 5)

In [5]:
# Number of rows whose tweet_id already appeared in an earlier row
int(twitter_df.duplicated("tweet_id").sum())

17381

In [6]:
# Keep the first row for each tweet_id, then renumber the index from 0
twitter_df = (
    twitter_df
    .drop_duplicates(subset=['tweet_id'], keep='first')
    .reset_index(drop=True)
)

In [7]:
# (rows, columns) after dropping duplicate tweet_ids
twitter_df.shape

(22153, 5)

In [8]:
#https://catriscode.com/2021/03/02/extracting-or-removing-mentions-and-hashtags-in-tweets-using-python/
#https://www.debuggex.com/cheatsheet/regex/python
#https://stackoverflow.com/questions/50830214/remove-usernames-from-twitter-data-using-python/50830588
#https://stackoverflow.com/questions/14081050/remove-all-forms-of-urls-from-a-given-string-in-python
def remove_splchar(tweet):
    """Normalise a raw tweet into lowercase, letters-only text.

    Steps, in order: lowercase, drop @mentions, drop URLs, drop whole
    hashtags, replace every remaining non-letter with a space, collapse
    runs of whitespace and trim the ends.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        Cleaned text containing only a-z words separated by single spaces.
    """
    tweet = tweet.lower()
    # Raw strings keep the regex escapes from being read as (invalid) string escapes.
    tweet = re.sub(r'@[^\s]+', ' ', tweet)  # remove usernames
    tweet = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', ' ', tweet)  # remove URLs
    # Hashtags go BEFORE the character filter: otherwise digits/underscores inside
    # a tag are blanked first and the tail of the tag ("#remote_work" -> "work")
    # leaks into the cleaned text.
    tweet = re.sub(r'#[A-Za-z0-9_]+', ' ', tweet)  # remove hashtags
    # Anything that is not a letter (including a stray '#') becomes a space.
    tweet = re.sub(r'[^a-zA-Z]', ' ', tweet)
    tweet = re.sub(r'\s+', ' ', tweet)  # collapse whitespace runs
    return tweet.strip()

In [9]:
#https://www.geeksforgeeks.org/python-spilt-a-sentence-into-list-of-words/
#https://stackoverflow.com/questions/771918/how-do-i-do-word-stemming-or-lemmatization
def lemmitize(tweet):
    """Drop English stop words from a tweet and lemmatize the remaining words.

    Parameters
    ----------
    tweet : str
        Whitespace-separated text.

    Returns
    -------
    str
        The kept words, each passed through ``WordNetLemmatizer.lemmatize``,
        joined with single spaces.
    """
    wnl = WordNetLemmatizer()
    # Fetch the stop-word list once per call and store it as a set; the previous
    # version re-fetched the list and scanned it linearly for every single word.
    stop_words = set(stopwords.words('english'))
    return " ".join(
        wnl.lemmatize(word) for word in tweet.split() if word not in stop_words
    )

In [10]:
# Instantiate the sentiment-analysis pipeline.
# The checkpoint is named explicitly (it is the one the un-pinned call resolved to
# in the recorded output) so results do not change if the library default changes.
sent = pipeline(
    'sentiment-analysis',
    model='distilbert-base-uncased-finetuned-sst-2-english',
)

All model checkpoint layers were used when initializing TFDistilBertForSequenceClassification.

All the layers of TFDistilBertForSequenceClassification were initialized from the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForSequenceClassification for predictions without further training.


In [12]:
#Get polarity score of each tweet
def getPolarity(tweet):
    """Run ``sent`` on a tweet and return 1 if the first result's label is 'POSITIVE', else 0."""
    first_result = sent(tweet)[0]
    return 1 if first_result['label'] == 'POSITIVE' else 0

In [13]:
def get_cities_by_country(code):
    """Return lowercase city names from ``gc`` for one country code.

    Only cities whose ``countrycode`` equals ``code`` and whose name is longer
    than 3 characters are included.
    """
    return [
        entry['name'].lower()
        for entry in gc.get_cities().values()
        if entry['countrycode'] == code and len(entry['name']) > 3
    ]

In [14]:
def getstates_citiesz_of_usa():
    """Return lowercase US city names followed by lowercase US state/territory names."""
    city_names = get_cities_by_country('US')
    state_names = [entry.name.lower() for entry in us.states.STATES_AND_TERRITORIES]
    return city_names + state_names

In [15]:
# Build the lowercase place-name lists once; checkForUSA, check_for_india and
# check_for_uk scan them for every location string.
usa_state_cities_names = getstates_citiesz_of_usa()
indian_cities = get_cities_by_country('IN')
uk_cities = get_cities_by_country('GB')

In [16]:
def checkForUSA(location):
    """Return 'USA' when a free-text location looks American, else the input unchanged.

    Checks, in order:
    1. the standalone token 'usa' or the phrase 'united states';
    2. ``us.states.lookup`` on the text after the last comma, or the
       substrings 'america' / 'u.s.a.';
    3. any entry of ``usa_state_cities_names`` occurring in the location.

    Parameters
    ----------
    location : str
        User-supplied location text.

    Returns
    -------
    str
        'USA' on a match, otherwise ``location`` as given.
    """
    lowered = location.lower()
    # 'usa' must stand alone: a bare substring test also hits "Jerusalem" and "Lausanne".
    if re.search(r'(?<![a-z])usa(?![a-z])', lowered) or 'united states' in lowered:
        return 'USA'
    last_segment = location.split(',')[-1].strip()
    # NOTE(review): 'america' also matches e.g. "South America" - kept as in the original.
    if (us.states.lookup(last_segment) is not None
            or 'america' in lowered
            or 'u.s.a.' in lowered):
        return 'USA'
    if any(place in lowered for place in usa_state_cities_names):
        return 'USA'
    return location

In [17]:
def check_for_india(location):
    """Return 'India' if the location mentions 'india' or any name in ``indian_cities``.

    Matching is a case-insensitive substring test; anything else is returned unchanged.
    """
    lowered = location.lower()
    if 'india' in lowered:
        return 'India'
    if any(city in lowered for city in indian_cities):
        return 'India'
    return location

In [18]:
def check_for_uk(location):
    """Return 'United Kingdom' when a free-text location looks British, else the input.

    A location matches when it contains the standalone token 'uk', the
    substring 'britain' or 'united kingdom', or any name in ``uk_cities``
    (case-insensitive).

    Parameters
    ----------
    location : str
        User-supplied location text.

    Returns
    -------
    str
        'United Kingdom' on a match, otherwise ``location`` as given.
    """
    lowered = location.lower()
    # 'uk' must stand alone, otherwise "Ukraine" is classified as the UK.
    # 'britain' replaces the misspelt 'britan', which never matched "Britain".
    if (re.search(r'(?<![a-z])uk(?![a-z])', lowered)
            or 'britain' in lowered
            or 'united kingdom' in lowered):
        return 'United Kingdom'
    if any(city in lowered for city in uk_cities):
        return 'United Kingdom'
    return location

In [19]:
def check_for_others(location):
    """Return the country name from ``gc`` that is mentioned in a location, if any.

    A country name matches only as a whole word sequence (case-insensitive),
    and when several names match the longest one wins. Plain first-substring
    matching resolved "Romania" to "Oman" and could resolve "Nigeria" to "Niger".

    Parameters
    ----------
    location : str
        Location text (raw, or already normalised by an earlier check).

    Returns
    -------
    str
        The matching country name, otherwise ``location`` as given.
    """
    lowered = location.lower()
    matches = []
    for country_name in gc.get_countries_by_names():
        needle = country_name.lower()
        # Cheap substring test first; the boundary regex only runs on candidates.
        if needle in lowered and re.search(
            r'(?<!\w)' + re.escape(needle) + r'(?!\w)', lowered
        ):
            matches.append(country_name)
    return max(matches, key=len) if matches else location

In [20]:
# Hand-curated exact-match table: leftover location string -> ISO 3166-1 alpha-2 code.
# Corrected relative to the earlier if/elif chain: 'Prague' (was DE) -> CZ,
# 'Bucharest' (was DE) -> RO, 'New England' (was GB) -> US.
# NOTE(review): 'Europe' -> DE and the multi-city strings ('LON | NYC | ...',
# 'NYC - LON - SIN - BALI') are arbitrary single-country picks kept from the original.
_MANUAL_LOCATION_CODES = {
    # Germany
    'Europe': 'DE',
    'Berlin, Deutschland': 'DE',
    'Sarrbr√ºken': 'DE',
    'Berlin': 'DE',
    "Deutschland": 'DE',
    # Czech Republic / Romania
    'Prague': 'CZ',
    'Bucharest': 'RO',
    # United Kingdom
    'North West, England': 'GB',
    'England': 'GB',
    'South West, England': 'GB',
    'South East, England': 'GB',
    'Scotland': 'GB',
    'North East, England': 'GB',
    'West Sussex - 01293 300 020': 'GB',
    # Brazil
    'Rio de janeiro': 'BR',
    'S√£o Paulo, Brasil': 'BR',
    # Canada
    'Toronto': 'CA',
    'Edmonton, Alberta': 'CA',
    'Toronto, ON': 'CA',
    'Fort St. John': 'CA',
    'Montr√©al, Qu√©bec': 'CA',
    'Montr√©al, Qc': 'CA',
    # Australia
    'Sydney, New South Wales': 'AU',
    'Carnegie, VIC, AU, 3163': 'AU',
    'LON | NYC | HKG  | SYD | SIN': 'AU',
    'Sydney': 'AU',
    # United States
    'NYC': 'US',
    'Silicon Valley & beyond': 'US',
    'üá∫üá∏ M.4.G.4.üá∫üá∏  IFB4P ü¶Ö üèà': 'US',
    'monkeytown': 'US',
    'East Coast': 'US',
    'Morgan City, LA 70380': 'US',
    'Mid-Atlantic': 'US',
    '14 locations in US. Cincy HQ.': 'US',
    'MA: 40.758348,-73.986972': 'US',
    'SF Bay Area': 'US',
    'Appalachia': 'US',
    'New England': 'US',
    # Philippines
    'Iloilo City, Western Visayas': 'PH',
    'Manila': 'PH',
    'Poblacion, Talibon, Bohol': 'PH',
    'Phlippines': 'PH',
    # India
    'Bhopal Madhya Pradesh': 'IN',
    'Thane': 'IN',
    'Kolkata, West Bengal': 'IN',
    'Chandigarh': 'IN',
    '‡¥ï‡µá‡¥∞‡¥≥‡¥Ç': 'IN',
    'Kolkata': 'IN',
    'Haryana': 'IN',
    # Sweden
    'G√∂teborg, Sverige': 'SE',
    # Turkey
    'Ankara': 'TR',
    'Istanbul': 'TR',
    'Adana': 'TR',
    'Eski≈üehir, T√ºrkiye': 'TR',
    'Izmir': 'TR',
    # Mexico
    'M√©xico': 'MX',
    # Japan
    'Tokyo': 'JP',
    'anywhere üåç / Born in Tokyoüóº': 'JP',
    # Spain
    'Madrid': 'ES',
    'Barcelona, Espa√±a': 'ES',
    'Espa√±a': 'ES',
    'Madrid, Comunidad de Madrid': 'ES',
    # Single-entry countries
    'Kampala': 'UG',
    'Buenos Aires': 'AR',
    'UAE': 'AE',
    'Dubai, UAE': 'AE',
    'Bali, Indonesien': 'ID',
    'NYC - LON - SIN - BALI': 'ID',
    'Brunssum': 'NL',
    'Bloemfontein, ZA': 'ZA',
    'Cape Town': 'ZA',
    '137 Hennie Alberts Brackenhurs': 'ZA',
    'Caracas': 'VE',
    'Z√ºrich, Schweiz': 'CH',
    'Harare': 'ZW',
    'Jhelum': 'PK',
    'Nairobi': 'KE',
}


def check_for_none(location):
    """Map a location string to a country code via the manual exact-match table.

    Parameters
    ----------
    location : str
        Location text; it must equal a table key exactly (case-sensitive) to match.

    Returns
    -------
    str
        The two-letter country code for a listed location, otherwise
        ``location`` unchanged.
    """
    return _MANUAL_LOCATION_CODES.get(location, location)

**Combined #workfromhome and #remotework dataframe cleaning**

In [21]:
# Drop the 'Unnamed: 0' column.
# Reassigning (instead of inplace=True) with errors='ignore' lets this cell be
# re-run after the column is already gone without raising a KeyError.
twitter_df = twitter_df.drop(columns='Unnamed: 0', errors='ignore')

In [22]:
# Preview the first three rows to confirm the 'Unnamed: 0' column was removed
twitter_df.head(3)

Unnamed: 0,tweets,user_location,tweet_posted_on,tweet_id
0,üè° #RemoteWork is on the rise &amp; slowly beco...,"New York, NY",Fri Jul 16 23:43:03 +0000 2021,1416181616846811137
1,Opportunity to join a fantastic team at a hi-t...,London | New York,Fri Jul 16 23:42:07 +0000 2021,1416181380279635970
2,Good news for #JobSeekers open to #RemoteWork!...,Sydney | Hong Kong | Singapore,Fri Jul 16 23:41:30 +0000 2021,1416181225979473920


In [23]:
# (rows, columns) after dropping the 'Unnamed: 0' column
twitter_df.shape

(22153, 4)

In [24]:
# Count missing values in each column
twitter_df.isna().sum()

tweets             0
user_location      1
tweet_posted_on    0
tweet_id           0
dtype: int64

In [25]:
# Inspect the dtype of every column
twitter_df.dtypes

tweets             object
user_location      object
tweet_posted_on    object
tweet_id            int64
dtype: object

In [26]:
# Parse the tweet_posted_on strings into pandas datetimes (format is inferred by pandas)
#https://stackoverflow.com/questions/50503033/remove-minutes-and-hours-from-series
twitter_df['tweet_posted_on'] = pd.to_datetime(twitter_df['tweet_posted_on'])

In [27]:
#https://stackoverflow.com/questions/52673285/performance-of-pandas-apply-vs-np-vectorize-to-create-new-column-from-existing-c/52674448#52674448
# Create 'cleaned_tweets' by running remove_splchar on every raw tweet via np.vectorize
twitter_df['cleaned_tweets'] = np.vectorize(remove_splchar)(twitter_df['tweets'])

In [28]:
# Keep only the words that are longer than 3 characters
twitter_df['cleaned_tweets'] = twitter_df['cleaned_tweets'].map(
    lambda text: ' '.join(word for word in text.split() if len(word) > 3)
)

In [29]:
# Replace each cleaned tweet with its stop-word-free, lemmatized text (a single string per row)
twitter_df['cleaned_tweets'] = twitter_df['cleaned_tweets'].map(lemmitize)

In [30]:
# Create a 'polarity' column holding getPolarity's binary label for each cleaned tweet
# (1 = 'POSITIVE', 0 = anything else)
twitter_df['polarity'] = twitter_df['cleaned_tweets'].map(getPolarity)

In [31]:
# Location step 1: cast user_location to str (a missing value becomes 'nan') and tag US locations as 'USA'
twitter_df['user_location_cleaned'] = twitter_df['user_location'].astype(str).map(checkForUSA)

In [32]:
# Location step 2: tag Indian locations as 'India'
twitter_df['user_location_cleaned'] = twitter_df['user_location_cleaned'].astype(str).map(check_for_india)

In [33]:
# Location step 3: tag UK locations as 'United Kingdom'
twitter_df['user_location_cleaned'] = twitter_df['user_location_cleaned'].astype(str).map(check_for_uk)

In [34]:
# Location step 4: replace locations that mention any other country name with that name
twitter_df['user_location_cleaned'] = twitter_df['user_location_cleaned'].astype(str).map(check_for_others)

In [35]:
# Location step 5: apply the hand-written exact-match overrides in check_for_none
twitter_df['user_location_cleaned'] = twitter_df['user_location_cleaned'].astype(str).map(check_for_none)

In [36]:
# Silence country_converter's "not found" warnings only.
# getLogger() with no name returns the ROOT logger, which would mute logging for
# the whole notebook; the package's loggers are presumably named under
# 'country_converter' (confirm against the installed version).
coco_logger = logging.getLogger('country_converter')
coco_logger.setLevel(logging.CRITICAL)  # let only CRITICAL records through
# Convert every cleaned location to an ISO2 code; unmatched names become the string 'None'
twitter_df['user_location_cleaned'] = coco.convert(names=twitter_df['user_location_cleaned'].tolist(), to='ISO2', not_found='None')

In [37]:
# (rows, columns) after adding cleaned_tweets, polarity and user_location_cleaned
twitter_df.shape

(22153, 7)

In [38]:
# Reorder the columns: identifiers first, then text, location and polarity
twitter_df = twitter_df[[
    'tweet_id',
    'tweet_posted_on',
    'tweets',
    'cleaned_tweets',
    'user_location',
    'user_location_cleaned',
    'polarity',
]]

In [39]:
# Preview the first three rows of the final frame
twitter_df.head(3)

Unnamed: 0,tweet_id,tweet_posted_on,tweets,cleaned_tweets,user_location,user_location_cleaned,polarity
0,1416181616846811137,2021-07-16 23:43:03+00:00,üè° #RemoteWork is on the rise &amp; slowly beco...,rise slowly becoming normal mean need remote t...,"New York, NY",US,1
1,1416181380279635970,2021-07-16 23:42:07+00:00,Opportunity to join a fantastic team at a hi-t...,opportunity join fantastic team tech fast pace...,London | New York,US,1
2,1416181225979473920,2021-07-16 23:41:30+00:00,Good news for #JobSeekers open to #RemoteWork!...,good news open excellent report remote advert ...,Sydney | Hong Kong | Singapore,HK,1


In [40]:
# Write the cleaned frame to ../dataset/twitter_df.csv without the index column
twitter_df.to_csv('../dataset/twitter_df.csv',index=False)