# Capstone Project

### Data Cleaning

In [1]:
#Import libraries
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from transformers import pipeline
import country_converter as coco
import geonamescache
import us
import logging

gc = geonamescache.GeonamesCache()

**Read in both #workfromhome and #remotework**

In [2]:
wfh_df_25 = pd.read_csv("../dataset/work_from_home_25.csv")
rw_df_25 = pd.read_csv("../dataset/remote_work_25.csv")

In [3]:
#https://catriscode.com/2021/03/02/extracting-or-removing-mentions-and-hashtags-in-tweets-using-python/
#https://www.debuggex.com/cheatsheet/regex/python
#https://stackoverflow.com/questions/50830214/remove-usernames-from-twitter-data-using-python/50830588
#https://stackoverflow.com/questions/14081050/remove-all-forms-of-urls-from-a-given-string-in-python
def remove_splchar(tweet):
    tweet = tweet.lower() # Lowercases the string
    tweet = re.sub('@[^\s]+', ' ', tweet) # remove usernames
    tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))', ' ', tweet) #remove URLs
    tweet=re.sub('[^a-zA-Z#]', ' ', tweet) # remove special characters, numbers, punctuations
    tweet = re.sub("#[A-Za-z0-9_]+"," ", tweet) #remove hashtags
    tweet = re.sub(r"\s+", " ", str(tweet)) # replace double spaces with single space
    return tweet

In [4]:
#https://www.geeksforgeeks.org/python-spilt-a-sentence-into-list-of-words/
#https://stackoverflow.com/questions/771918/how-do-i-do-word-stemming-or-lemmatization
def lemmitize(tweet):
    clean_column_list = []
    wnl = WordNetLemmatizer()
    words = tweet.split()
    for word in words:
        clean_column_list.append(wnl.lemmatize(word)) 
    return " ".join(clean_column_list)

In [5]:
#Instantiate sentimnt analysis
sent = pipeline('sentiment-analysis')

All model checkpoint layers were used when initializing TFDistilBertForSequenceClassification.

All the layers of TFDistilBertForSequenceClassification were initialized from the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForSequenceClassification for predictions without further training.


In [6]:
#Get polarity score of each tweet
def getPolarity(tweet):
    polarity_dic = sent(tweet)
    if polarity_dic[0]['label'] == 'POSITIVE':
        return 1
    else:
        return 0

In [7]:
def get_cities_by_country(code):
    cities = []
    for city in gc.get_cities().values():
        if (city['countrycode'] == code) and (len(city['name']) > 3):
            cities.append(city['name'])
    return list(map(lambda x: x.lower(), cities));

In [8]:
def getstates_citiesz_of_usa():
    usa_state_names = [state.name.lower() for state in us.states.STATES_AND_TERRITORIES]
    return get_cities_by_country('US') + usa_state_names;

In [9]:
usa_state_cities_names = getstates_citiesz_of_usa()
indian_cities = get_cities_by_country('IN')
uk_cities = get_cities_by_country('GB')

In [10]:
def checkForUSA(location):
    country = location;
    if location.lower().find('usa') != -1 or location.lower().find('united states of america') != -1 or location.lower().find('united states') != -1:
        country = 'USA'
    elif us.states.lookup(location.split(',')[-1].strip()) != None  or  location.lower().find('america') != -1 or location.lower().find('u.s.a.') != -1:  
        country = 'USA'
    else:    
        for state_city in usa_state_cities_names:
            if location.lower().find(state_city) != -1:
                country = 'USA'
                break;
    return country

In [11]:
def check_for_india(location):
    country = location;
    if location.lower().find('india') != -1:
        country = 'India'
    else:    
        for cities in indian_cities:
            if location.lower().find(cities) != -1:
                country = 'India'
                break;
    return country    

In [12]:
def check_for_uk(location):
    country = location;
    if location.lower().find('uk') != -1 or location.lower().find('britan') != -1 or location.lower().find('united kingdom') != -1:
        country = 'United Kingdom'
    else:    
        for cities in uk_cities:
            if location.lower().find(cities) != -1:
                country = 'United Kingdom'
                break;
    return country    

In [13]:
def check_for_others(location):
    country = location;  
    dicts = gc.get_countries_by_names();
    for country_name,data in dicts.items():
        if location.lower().find(country_name.lower()) != -1:
            return country_name
    return country  

**#workfromhome  dataframe cleaning**

In [14]:
#Drop Unnamed: 0 column
wfh_df_25.drop('Unnamed: 0', axis=1, inplace=True)

In [15]:
#Check to confirm if the column got deleted 
wfh_df_25.head(3)

Unnamed: 0,tweets,user_location,tweet_posted_on,tweet_id
0,"RT @CloudDeskApp: For better or worse, working...","Miami, FL",Mon Jul 26 02:36:22 +0000 2021,1419486724623716355
1,"For better or worse, working from home is here...","Atlanta, GA",Mon Jul 26 02:35:57 +0000 2021,1419486619183116289
2,When it's #monday again but you just can't... ...,Singapore,Mon Jul 26 02:30:13 +0000 2021,1419485177458397189


In [16]:
#Check the shape of the dataframe
wfh_df_25.shape

(7193, 4)

In [17]:
#Check missing values
wfh_df_25.isnull().sum()

tweets             0
user_location      0
tweet_posted_on    0
tweet_id           0
dtype: int64

In [18]:
#Check datatypes
wfh_df_25.dtypes

tweets             object
user_location      object
tweet_posted_on    object
tweet_id            int64
dtype: object

In [19]:
# Change column to be datetime dtype
#https://stackoverflow.com/questions/50503033/remove-minutes-and-hours-from-series
wfh_df_25['tweet_posted_on'] = pd.to_datetime(wfh_df_25['tweet_posted_on'])

In [20]:
wfh_df_25['user_location'].unique()

array(['Miami, FL', 'Atlanta, GA', 'Singapore', ...,
       'Venezuela, Maracaibo', 'Stanley CupVille', 'Milwaukee, WI'],
      dtype=object)

In [21]:
#https://stackoverflow.com/questions/52673285/performance-of-pandas-apply-vs-np-vectorize-to-create-new-column-from-existing-c/52674448#52674448
#Create a new column for processed tweets
wfh_df_25['cleaned_tweets'] = np.vectorize(remove_splchar)(wfh_df_25['tweets'])

In [22]:
#Get list of words which have more than 3 letters
wfh_df_25['cleaned_tweets'] = wfh_df_25['cleaned_tweets'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))

In [23]:
# Place new lists of lemmitized words into the 'cleaned_tweets' column in the dataframe
wfh_df_25['cleaned_tweets'] = wfh_df_25['cleaned_tweets'].map(lemmitize)

In [24]:
#Create a new column 'polarity' to save tweet polarity score
wfh_df_25['polarity'] = wfh_df_25['cleaned_tweets'].map(getPolarity)

In [25]:
#Change column order
wfh_df_25 = wfh_df_25[['tweet_id','tweet_posted_on','tweets','cleaned_tweets','user_location','polarity']]

In [26]:
#Check the dataframe
wfh_df_25.head(3)

Unnamed: 0,tweet_id,tweet_posted_on,tweets,cleaned_tweets,user_location,polarity
0,1419486724623716355,2021-07-26 02:36:22+00:00,"RT @CloudDeskApp: For better or worse, working...",better worse working from home here stay,"Miami, FL",0
1,1419486619183116289,2021-07-26 02:35:57+00:00,"For better or worse, working from home is here...",better worse working from home here stay,"Atlanta, GA",0
2,1419485177458397189,2021-07-26 02:30:13+00:00,When it's #monday again but you just can't... ...,when again just,Singapore,0


**#remotework dataframe cleaning**

In [27]:
#Drop Unnamed: 0 column
rw_df_25.drop('Unnamed: 0', axis=1, inplace=True)

In [28]:
#Check to confirm if the column got deleted 
rw_df_25.head(3)

Unnamed: 0,tweets,user_location,tweet_posted_on,tweet_id
0,RT @dnaRtests: Great speaker! Highly recommend...,Jammu And Kashmir,Mon Jul 26 03:43:00 +0000 2021,1419503491010244610
1,RT @dnaRtests: Great speaker! Highly recommend...,Prague,Mon Jul 26 03:42:58 +0000 2021,1419503485876326402
2,RT @dnaRtests: Great speaker! Highly recommend...,Earth,Mon Jul 26 03:42:57 +0000 2021,1419503482483187713


In [29]:
#Check the shape of the dataframe
rw_df_25.shape

(5124, 4)

In [30]:
#Check missing values
rw_df_25.isnull().sum()

tweets             0
user_location      0
tweet_posted_on    0
tweet_id           0
dtype: int64

In [31]:
#Check datatypes
rw_df_25.dtypes

tweets             object
user_location      object
tweet_posted_on    object
tweet_id            int64
dtype: object

In [32]:
# Change column to be datetime dtype
rw_df_25['tweet_posted_on'] = pd.to_datetime(rw_df_25['tweet_posted_on'])

In [33]:
rw_df_25['user_location'].unique()

array(['Jammu And Kashmir', 'Prague', 'Earth', 'Kenya', 'India',
       'Sri Lanka', 'Grand Blanc, MI', 'Chicago, IL',
       'USA|EUROPE|ASIA|GLOBAL✈️', 'Worldwide', 'Ottawa Canada', 'World',
       'twitter', '🇺🇸 M.4.G.4.🇺🇸  IFB4P 🦅 🏈', 'Berlin, Germany',
       'Waterloo, Ontario Canada', 'Hamburg', 'Sarrbrüken',
       'Fort Lauderdale, FL', 'Melbourne', 'Toronto, Ontario, Canada',
       'United Kingdom UK GB', 'Internet', 'New Jersey', 'Portland, OR',
       'Turn 🔔 Notifications ON ', 'Global', 'Canada CA', 'Manila', 'USA',
       'Remote', 'London | New York', "Town 'n' Country, FL",
       'Atlanta Metro Area', 'Pittsburgh, PA', 'Calgary, Alberta', 'Lsk ',
       'New York City', 'Medellin, Colombia ', 'Westport, CT',
       'London, England', 'anywhere 🌍 / Born in Tokyo🗼', 'NY',
       'West & Central Africa', 'Việt Nam', 'Bay Shore, NY',
       'Newark, Delaware USA', 'Miami, FL', 'Washington, DC', 'maryland',
       'Australia', 'The Zoo, MT', 'W.A.T.P. uk', 'Berlin, Deutsc

In [34]:
#Create a new column for processed tweets
rw_df_25['cleaned_tweets'] = np.vectorize(remove_splchar)(rw_df_25['tweets'])

In [35]:
#Get list of words which have more than 3 letters
rw_df_25['cleaned_tweets'] = rw_df_25['cleaned_tweets'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))

In [36]:
# Place new lists of lemmitized words into the 'cleaned_tweets' column in the dataframe
rw_df_25['cleaned_tweets'] = rw_df_25['cleaned_tweets'].map(lemmitize)

In [37]:
#Create a new column 'polarity' to save tweet polarity score
rw_df_25['polarity'] = rw_df_25['cleaned_tweets'].map(getPolarity)

In [38]:
#Change column order
rw_df_25 = rw_df_25[['tweet_id','tweet_posted_on','tweets','cleaned_tweets','user_location','polarity']]

In [39]:
#Check the dataframe
rw_df_25.head(3)

Unnamed: 0,tweet_id,tweet_posted_on,tweets,cleaned_tweets,user_location,polarity
0,1419503491010244610,2021-07-26 03:43:00+00:00,RT @dnaRtests: Great speaker! Highly recommend...,great speaker highly recommend daysofcode,Jammu And Kashmir,1
1,1419503485876326402,2021-07-26 03:42:58+00:00,RT @dnaRtests: Great speaker! Highly recommend...,great speaker highly recommend daysofcode,Prague,1
2,1419503482483187713,2021-07-26 03:42:57+00:00,RT @dnaRtests: Great speaker! Highly recommend...,great speaker highly recommend daysofcode,Earth,1


### Merge both dataframes

In [87]:
#https://www.geeksforgeeks.org/python-intersection-two-lists/
#Check number of rows that are common in both based on 'tweet_id'
len(set(rw_df_25['tweet_id']).intersection(wfh_df_25['tweet_id']))

45

In [88]:
#https://stackoverflow.com/questions/21317384/pandas-python-how-to-concatenate-two-dataframes-without-duplicates
#https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.drop_duplicates.html
#Concatenate both datasets
data_19to25 = pd.concat([rw_df_25,wfh_df_25]).drop_duplicates(subset=['tweet_id'], keep='first').reset_index(drop=True)

In [89]:
data_19to25['user_location_cleaned'] = data_19to25['user_location'].map(checkForUSA)

In [90]:
data_19to25['user_location_cleaned'] = data_19to25['user_location_cleaned'].map(check_for_india)

In [91]:
data_19to25['user_location_cleaned'] = data_19to25['user_location_cleaned'].map(check_for_uk)

In [92]:
data_19to25['user_location_cleaned'] = data_19to25['user_location_cleaned'].map(check_for_others)

In [93]:
coco_logger = coco.logging.getLogger()
coco_logger.setLevel(logging.CRITICAL)
data_19to25['user_location_cleaned'] = coco.convert(names=data_19to25['user_location_cleaned'].tolist(), to='ISO2', not_found='None')

In [94]:
#Check the shape of the dataframe
data_19to25.shape

(8177, 7)

In [95]:
data_19to25.head(3)

Unnamed: 0,tweet_id,tweet_posted_on,tweets,cleaned_tweets,user_location,polarity,user_location_cleaned
0,1419503491010244610,2021-07-26 03:43:00+00:00,RT @dnaRtests: Great speaker! Highly recommend...,great speaker highly recommend daysofcode,Jammu And Kashmir,1,IN
1,1419503485876326402,2021-07-26 03:42:58+00:00,RT @dnaRtests: Great speaker! Highly recommend...,great speaker highly recommend daysofcode,Prague,1,
2,1419503482483187713,2021-07-26 03:42:57+00:00,RT @dnaRtests: Great speaker! Highly recommend...,great speaker highly recommend daysofcode,Earth,1,


In [96]:
#Save in a 'dataset' folder with a name 'final_df.csv'
data_19to25.to_csv('../dataset/data_19to25.csv')