In [2]:
from pyspark.sql import SparkSession

# Create the Spark session used by this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("Python Spark SQL basic example")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

In [None]:
import json
import emoji
from datetime import datetime, timedelta
from timezonefinder import TimezoneFinder
import pytz
import re
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut
import ssl
import certifi
import geopy.geocoders
from time import sleep
from nltk import bigrams

import pyspark

# Build an SSL context backed by certifi's CA bundle and install it as geopy's
# default SSL context, so geocoder requests made below use it.
ctx = ssl.create_default_context(cafile=certifi.where())
geopy.geocoders.options.default_ssl_context = ctx


class FilteredTweet:
    """Flattened, cleaned view of one raw tweet dictionary.

    Every instance attribute is a plain value (or None), so ``__dict__`` can
    be serialised directly; attributes are assigned in a fixed order so the
    serialised field order stays stable.
    """

    counter = 0  # Number of tweets indexed so far through increment(); shared by all instances.

    def __init__(self, tweet_dict):
        """Extract the cleaned fields from a raw tweet dictionary.

        Args:
            tweet_dict: Tweet dictionary providing 'created_at', 'text',
                'user', 'place' (with 'name', 'full_name', 'country' and
                'bounding_box' -> 'coordinates') and 'entities' -> 'hashtags'.

        Raises:
            KeyError: If one of the required keys is missing.
        """
        self.index = None  # Assigned later by increment().

        # Local time at the tweet's place, as computed by parseDate().
        localtime = parseDate(tweet_dict)
        self.tweet_weekday = localtime.weekday()
        self.tweet_day = localtime.day
        self.tweet_hour = localtime.hour
        self.orig_text = tweet_dict['text']
        self.filtered_text = parseTweetText(tweet_dict)

        user = tweet_dict['user']
        self.user_ID = user['id']
        self.user_verified = user['verified']
        self.user_followers_count = user['followers_count']
        self.user_friends_count = user['friends_count']
        self.user_listed_count = user['listed_count']
        self.user_statuses_count = user['statuses_count']

        user_creation_time = datetime.strptime(user['created_at'], '%a %b %d %H:%M:%S +0000 %Y')
        self.user_creation_year = user_creation_time.year
        self.user_creation_month = user_creation_time.month
        self.user_creation_day = user_creation_time.day

        place = tweet_dict['place']
        self.place_name = place['name']

        # 'full_name' is split on commas; the second piece, when present, is
        # taken as the province.
        city_and_province = place['full_name'].split(",")
        self.place_province = city_and_province[1].strip() if len(city_and_province) > 1 else None

        self.place_country = place['country']
        # The first vertex of the first bounding-box ring stands in for the
        # tweet position: index 0 is used as longitude, index 1 as latitude.
        first_vertex = place['bounding_box']['coordinates'][0][0]
        self.place_longitude = float(first_vertex[0])
        self.place_latitude = float(first_vertex[1])

        # Hashtags are lower-cased, stripped of everything except a-z and
        # whitespace, and stored as one comma-separated string.
        list_hashtags = [re.sub(r'[^a-z\s]', '', hashtag_dict['text'].lower())
                         for hashtag_dict in tweet_dict['entities']['hashtags']]
        self.hashtags = ','.join(list_hashtags) if list_hashtags else None

    def __str__(self):
        """Return the string form of the instance's attribute dictionary."""
        return str(self.__dict__)

    def setProvince(self, recursion=0):
        """Set ``place_province`` by reverse-geocoding the stored coordinates.

        The lookup is retried (after a one second pause) whenever the geocoder
        times out, until ``recursion`` exceeds 10.  ``place_province`` is left
        untouched when the geocoder returns no result or the result carries no
        'state' entry.

        Args:
            recursion: Number of retries already performed; callers normally
                leave this at its default.
        """
        geolocator = Nominatim(user_agent="my_application")
        try:
            location = geolocator.reverse(str(self.place_latitude) + "," + str(self.place_longitude),
                                          timeout=geopy.geocoders.base.DEFAULT_SENTINEL)
        except GeocoderTimedOut:
            if recursion > 10:
                return
            sleep(1)
            self.setProvince(recursion=recursion + 1)
            return

        # reverse() may yield no match at all; previously this case crashed
        # with AttributeError on `location.raw`.
        if location is None:
            return

        address = location.raw.get('address') or {}
        if 'state' in address:
            self.place_province = address['state']
            sleep(1)

    def increment(self):
        """Advance the shared counter and store its new value as this tweet's index."""
        FilteredTweet.counter = FilteredTweet.counter + 1
        self.index = FilteredTweet.counter


def filterCountry(tweet_dict, country):
    """Return True only when the tweet's place has a 'country' entry equal to `country`.

    Tweets with no 'place' key, a None place, or a place without a 'country'
    key are rejected.
    """
    place = tweet_dict.get('place')
    if place is None:
        return False
    return 'country' in place and place['country'] == country


def filterTrackable(tweet_dict):
    """Return True when parseTimeZone() can resolve a timezone for the tweet."""
    return parseTimeZone(tweet_dict) is not None


def filterLanguage(tweet_dict, language):
    """Return True only when the tweet has a non-None 'lang' value equal to `language`."""
    lang = tweet_dict.get('lang')
    if lang is None:
        return False
    return lang == language


def filterOriginal(tweet_dict):
    """Return True when the tweet is neither a quote nor a retweet.

    A tweet is rejected when 'is_quote_status' or 'retweeted' is truthy, or
    when it carries a 'retweeted_status' payload.  Missing keys are treated as
    "not set" instead of raising KeyError.

    The 'retweeted_status' check is needed because the 'retweeted' flag alone
    does not identify retweets: it describes the authenticating user's own
    action, whereas a retweet embeds the original tweet under
    'retweeted_status'.
    """
    if tweet_dict.get('is_quote_status'):
        return False
    if tweet_dict.get('retweeted'):
        return False
    return tweet_dict.get('retweeted_status') is None


def parseDate(tweet_dict):
    """Return the tweet's creation time as naive local wall-clock time at its place.

    'created_at' is parsed as UTC (the format string requires a literal
    '+0000' offset) and converted to the timezone that parseTimeZone()
    resolves for the tweet.

    The conversion is done on a timezone-aware value: calling astimezone() on
    a naive datetime would interpret it in the machine's local zone, and
    reading the offset back as whole hours would drop the minutes of
    half-hour zones.

    Args:
        tweet_dict: Tweet dictionary with a 'created_at' entry and a place
            that parseTimeZone() can resolve.

    Returns:
        A naive datetime holding the local date and time.

    Raises:
        ValueError: If 'created_at' does not match the expected format.
        Whatever pytz.timezone() raises when parseTimeZone() yields no usable
        zone name; callers are expected to have applied filterTrackable().
    """
    utc_time = datetime.strptime(tweet_dict['created_at'], '%a %b %d %H:%M:%S +0000 %Y')
    local_zone = pytz.timezone(parseTimeZone(tweet_dict))
    local_time = pytz.utc.localize(utc_time).astimezone(local_zone)
    # Callers only read calendar fields, so return a naive value as before.
    return local_time.replace(tzinfo=None)


def parseTimeZone(tweet_dict):
    """Return the timezone name for the tweet's place, or None when it cannot be located.

    The first vertex of the first ring of place -> bounding_box -> coordinates
    is used as the position (index 0 as longitude, index 1 as latitude).
    None is returned when 'place', 'bounding_box' or 'coordinates' is missing,
    None or empty.

    Args:
        tweet_dict: Raw tweet dictionary.

    Returns:
        The value reported by TimezoneFinder for the position (falling back to
        the closest timezone when there is no exact match), or None.
    """
    place = tweet_dict.get('place')
    if place is None:
        return None
    bounding_box = place.get('bounding_box')
    if bounding_box is None:
        return None
    coordinates = bounding_box.get('coordinates')
    if not coordinates:
        return None

    longitude = float(coordinates[0][0][0])
    latitude = float(coordinates[0][0][1])

    # Only build the finder once a position is actually available.
    tf = TimezoneFinder()
    timezone = tf.timezone_at(lat=latitude, lng=longitude)
    if timezone is None:
        timezone = tf.closest_timezone_at(lat=latitude, lng=longitude)
    return timezone


def filterTweet(tweet_dict, language, country):
    """Return True when the tweet passes the country, language, original and trackable filters.

    The filters run in that order and evaluation stops at the first failure.
    """
    if not filterCountry(tweet_dict, country):
        return False
    if not filterLanguage(tweet_dict, language):
        return False
    if not filterOriginal(tweet_dict):
        return False
    return filterTrackable(tweet_dict)


def is_emoji(s):
    """Return True when `s` is contained in the emoji package's UNICODE_EMOJI table.

    NOTE(review): the layout (and availability) of emoji.UNICODE_EMOJI is
    believed to differ between releases of the emoji package - confirm it
    against the installed version.
    """
    known_emoji = emoji.UNICODE_EMOJI
    return s in known_emoji


def add_space(text):
    """Surround every emoji character in `text` with spaces and strip the result.

    Characters are classified one at a time with is_emoji(); all other
    characters are copied unchanged.
    """
    pieces = []
    for symbol in text:
        if is_emoji(symbol):
            pieces.append(' ' + symbol + ' ')
        else:
            pieces.append(symbol)
    return ''.join(pieces).strip()


def parseTweetText(tweetDict):
    """Return the cleaned, lower-cased text of a tweet.

    Steps, in order:
      1. Replace newlines with spaces, pad emoji with spaces (add_space) and
         pass the text through emoji.demojize.
      2. Drop every space-separated token starting with 'https'; lower-case
         the rest and remove characters outside a-z, 0-9, whitespace and
         ``.,:;!?_#@``.
      3. Apply negation_sub() while the punctuation is still present.
      4. Remove ``.,:;!?``, turn ``#``/``@`` into spaces, trim surrounding
         spaces and underscores, and drop tokens that end up empty.
    """
    prepared = emoji.demojize(add_space(tweetDict["text"].replace('\n', " ")))

    first_pass = ' '.join(
        re.sub(r'[^a-z0-9\s.,:;!?_#@]', '', token.lower()).strip()
        for token in re.split(' +', prepared)
        if not token.startswith('https')
    )

    cleaned_tokens = []
    for token in re.split(' +', negation_sub(first_pass)):
        token = re.sub(r'[#@]', ' ', re.sub(r'[.,:;!?]', '', token))
        # Whitespace is trimmed only when a literal space sits on an edge.
        if token.startswith(" ") or token.endswith(" "):
            token = token.strip()
        token = token.strip("_")
        if token != "":
            cleaned_tokens.append(token)
    return ' '.join(cleaned_tokens)


def negation_sub(text):
    """Prefix words that follow a negation cue with ``NEG_``.

    A negation scope starts at one of the listed cue words (matched
    case-insensitively) and runs over word/whitespace characters up to the
    next ``.,:;!?#@`` character.  Every word inside the scope that is preceded
    by whitespace gets the prefix; text without a closing punctuation mark is
    returned unchanged.
    """

    def mark_scope(match):
        # Tag each whitespace-preceded word inside the matched scope.
        return re.sub(r'(\s+)(\w+)', r'\1NEG_\2', match.group(0))

    negation_scope = re.compile(
        r'\b(?:not|never|aint|doesnt|havent|lacks|none|mightnt|shouldnt|'
        r'cannot|dont|neither|nor|mustnt|wasnt|cant|hadnt|isnt|neednt|without|'
        r'darent|hardly|lack|nothing|oughtnt|wouldnt|didnt|hasnt|lacking|nobody|'
        r'nowhere|shant)\b[\w\s]+[.,:;!?#@]',
        flags=re.IGNORECASE,
    )
    return negation_scope.sub(mark_scope, text)


def checkSleepWords(tweet_dict, list_sleep_words, list_sleep_bigrams):
    """Return True when the filtered text contains a sleep keyword or sleep bigram.

    Args:
        tweet_dict: Object exposing a `filtered_text` string attribute.
        list_sleep_words: Keywords compared against each word of the text.
        list_sleep_bigrams: Two-item sequences compared against each pair of
            consecutive words of the text.
    """
    tokens = tweet_dict.filtered_text.split()
    if any(token in list_sleep_words for token in tokens):
        return True
    return any(
        pair[0] == target[0] and pair[1] == target[1]
        for pair in bigrams(tokens)
        for target in list_sleep_bigrams
    )


def checkStressWords(tweet_dict, list_stress_words):
    """Return True when any word of the filtered text is a stress keyword.

    Args:
        tweet_dict: Object exposing a `filtered_text` string attribute.
        list_stress_words: Keywords compared against each word of the text.
    """
    return any(token in list_stress_words for token in tweet_dict.filtered_text.split())


def province_filter(filtered_tweet):
    """Correct or reject a tweet based on its `place_province` value.

    A province listed in the replacement table is rewritten in place and the
    same tweet object is returned.  A province on the removal list makes the
    function return None.  Any other value is returned untouched.
    """
    removed_provinces = ("Minnesota", "Michigan", "Iowa", "Alaska")
    replacements = {
        "Subd. V": "Newfoundland and Labrador",
        "Nouveau-Brunswick": "New Brunswick",
        "Washington": "Alberta",
        "Montana": "Saskatchewan",
        "North Dakota": "Manitoba",
    }

    province = filtered_tweet.place_province
    if province in replacements:
        filtered_tweet.place_province = replacements[province]
        return filtered_tweet
    if province in removed_provinces:
        return None
    return filtered_tweet


# Input dataset path; the file is expected to hold one JSON tweet object per line.
filename = "Twitter_Dataset.json"  # Name of .json dataset to pass.  Should have 1 tweet per line of file.
# Load the dataset into a Spark DataFrame through the `spark` session and print a preview of its rows.
df = spark.read.json(filename)
df.show()
# filename = "E:/Machine Learning Data/New/1.json"  # Name of .json dataset to pass.  Should have 1 tweet per line of file.

# output_filename = "Cleaned_twitter_dataset.json"  # Name of clean .json file to write to.  Will have 1 clean tweet per line of file.
# 
# list_non_provinces = ['Canada', 'Montréal', 'Subd. A', 'Subd. C', 'Unorganized',
#                       'Toronto', 'Vancouver', 'Subd. Y', 'Subd. B', 'Subd. O', '143', 'Subd. D', 'Calgary',
#                       'Nouveau-Brunswick', 'Subd. V',
#                       'Subd. E']  # List of non-provinces in original tweet to be corrected.
# list_sleep_words = ["bed", "sack", "insomnia", "dodo", "zzz", "siesta", "tired", "nosleep",
#                     "cantsleep", "rest", "asleep", "slept", "sleeping", "sleepy",
#                     "ambien", "zolpidem", "lunesta", "intermezzo", "trazadone", "eszopiclone",
#                     "zaleplon"]  # List of sleep words to check for.
# list_sleep_bigrams = [["pass", "out"], ["get", "up"], ["wake", "up"],
#                       ["power", "nap"]]  # List of sleep bigrams to compare tweet text to.
# list_stress_words = ["heart", "control", "depression", "disease", "study", "studies", "life",
#                      "stressor", "body", "stress",
#                      "anxiety", "health", "feel", "pain",
#                      "social", "stressors", "pressure", "work", "risk", "stressful", "busy",
#                      "depressed", "nervous", "university", "cancer", "marry", "wedding", "bride",
#                      "income", "salary", "rent", "hospital", "sick", "school", "holiday", "finals",
#                      "born", "baby", "life", "fired", "job", "lose", "cold", "war", "quarrel", "argue",
#                      "question", "blame",
#                      "afraid", "baby", "pregnant", "mother-to-be", "revise",
#                      "habits", "smoke", "drink", "pass", "away", "RIP", "divorce", "ex-wife", "cry",
#                      "surgery"]  # List of stress keywords to compare tweet text to.
# 
# with open(filename, 'r', encoding='utf8', errors='ignore') as f:  # Open input file
#     with open(output_filename, 'w', encoding='utf8') as outputFile:  # Open output file
#         for (i, line) in enumerate(f, 1):  # Iterate through each line in input file.
#             tweet_dict = json.loads(line)  # Load the .json object into a dictionary.
#             if filterTweet(tweet_dict, "en", "Canada"):  # Filter the tweet.
#                 new_tweet = FilteredTweet(tweet_dict)  # Create a new filtered tweet.
#                 if new_tweet.place_province is None or new_tweet.place_province in list_non_provinces:  # Correct the province if necessary
#                     new_tweet.setProvince()
#                 new_tweet = province_filter(new_tweet)  # Check that province has been corrected.
#                 if new_tweet is not None:
#                     if (checkSleepWords(new_tweet, list_sleep_words, list_sleep_bigrams)) \
#                             or (
#                             checkStressWords(new_tweet, list_stress_words)):  # Check for sleep/stress keywords/bigrams.
#                         new_tweet.increment()  # Set the index.
#                         new_tweet_json = json.dumps(
#                             new_tweet.__dict__)  # Create a new json object from the clean tweet.
#                         clean_tweet_json = json.dumps(new_tweet.__dict__, indent=4)
#                         print(clean_tweet_json)
#                         outputFile.write(new_tweet_json)  # Write the new json object to the output file.
#                         outputFile.write('\n')  # Write a newline to separate tweets in output file.
#                         print("Current tweets written:", FilteredTweet.counter, ", Tweets evaluated:", i)
#             # if i == 1000000:
#             #     break
# 
# print("****************************************")
# print("Total clean tweets written: ", FilteredTweet.counter)
# print("Total tweets evaluated: ", i)
