<div style="font-size: 17px">
    <b> TASK 1 </b>
</div >

In [1]:
from pymongo import MongoClient 
import nltk 
from nltk.tokenize import word_tokenize 
from nltk.corpus import stopwords 
import re
import json

In [2]:
def download_nltk():
    """Download the NLTK resources this notebook relies on.

    Needs to run once per machine before any of the NLTK-backed helpers are
    used; packages that are already installed are reported as up to date.
    """
    nltk.download("stopwords")  # stop-word list read by remove_stopwords
    nltk.download("punkt")  # tokenizer models used for tokenising
    # remove_non_english_words reads nltk.corpus.words, which is a separate
    # package; without it that helper fails on a machine with a fresh NLTK install.
    nltk.download("words")

*CURATING DATA*

In [3]:
def remove_digits(text):
    """Replace every digit character in ``text`` with a single space."""
    digit_pattern = re.compile(r"[\d]")
    return digit_pattern.sub(" ", text)

def remove_URL(text):
    """Return ``text`` with URL-like content removed.

    Whitespace-separated tokens that start with ``http:``, ``https:`` or
    ``www.`` are dropped (the remaining tokens are re-joined with single
    spaces), then every remaining ``<word>.com`` fragment is replaced by a space.
    """
    url_prefixes = ("http:", "https:", "www.")
    text = " ".join(x for x in text.split() if not x.startswith(url_prefixes))
    # The dot must be escaped: an unescaped "." matches any character, which
    # cut ordinary words such as "welcome" or "become" down to a single letter.
    # The trailing \b keeps words like "example.community" intact.
    text = re.sub(r'[\w]+\.com\b', " ", text)
    return text

def remove_tagging(text):
    """Drop every whitespace-separated token that begins with '@'.

    The surviving tokens are joined back together with single spaces.
    """
    kept_tokens = []
    for token in text.split():
        if token.startswith("@"):
            continue
        kept_tokens.append(token)
    return " ".join(kept_tokens)

def remove_non_alphabetical(text):
    """Replace each non-word character in ``text`` with a single space.

    Regex word characters (letters, digits and the underscore) are kept;
    everything else, such as punctuation and emojis, becomes a space.
    """
    non_word_pattern = re.compile(r"[^\w]")
    return non_word_pattern.sub(" ", text)

def remove_email(text):
    """Return ``text`` with each email address replaced by a single space.

    Matching is case-insensitive: the character classes only list lowercase
    letters, so without the flag an address containing capitals
    (e.g. ``John.Doe@Example.COM``) would be left in the text.
    """
    return re.sub(
        r"[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+",
        " ",
        text,
        flags=re.IGNORECASE,
    )

def _english_vocabulary():
    """Return the NLTK ``words`` corpus as a set, building it on first use only."""
    vocabulary = getattr(_english_vocabulary, "_cache", None)
    if vocabulary is None:
        vocabulary = set(nltk.corpus.words.words())
        _english_vocabulary._cache = vocabulary
    return vocabulary


def remove_non_english_words(text):
    """Return ``text`` without alphabetic tokens missing from the NLTK vocabulary.

    The text is split with ``nltk.wordpunct_tokenize``. A token is kept when
    its lowercase form is in the vocabulary, or when it is not purely
    alphabetic (numbers, punctuation); kept tokens are joined with spaces.
    """
    # The vocabulary set is cached: rebuilding it from the corpus on every
    # call repeated the same expensive work once per tweet.
    words = _english_vocabulary()
    return " ".join(w for w in nltk.wordpunct_tokenize(text) if w.lower() in words or not w.isalpha())

def remove_single_character(text):
    """Keep only the whitespace-separated tokens longer than one character.

    Stray single letters such as 'r', 'e' or 'h' are dropped and the rest is
    joined back together with single spaces.
    """
    tokens = text.split()
    longer_tokens = [token for token in tokens if len(token) > 1]
    return " ".join(longer_tokens)
    
def join_single_space(text):
    """Strip ``text``, tokenise it with ``word_tokenize`` and join the tokens with one space each."""
    stripped = text.strip()
    tokens = word_tokenize(stripped)
    return " ".join(tokens)

def curate_body(text):
    """Run a tweet body through the full cleaning pipeline and return the result.

    The steps run in this fixed order: emails, URLs, @-mentions, digits,
    non-English words, non-word characters, single-character tokens, and
    finally whitespace normalisation.
    """
    cleaning_steps = (
        remove_email,
        remove_URL,
        remove_tagging,
        remove_digits,
        remove_non_english_words,
        remove_non_alphabetical,
        remove_single_character,
        join_single_space,
    )
    for step in cleaning_steps:
        text = step(text)
    return text
    
# Connect to the local MongoDB server and select the assignment database.
client = MongoClient("localhost:27017")
db = client["Assignment_1"]

# Only strictly necessary the first time the notebook runs on a machine;
# kept here to demonstrate the step.
download_nltk()

# Store a cleaned copy of every tweet body under the "curated body" field
# of the '10000 tweets' collection.
tweet_collection = db["10000 tweets"]
for tweet in tweet_collection.find({}, {"body": 1, "_id": 1}):
    cleaned_body = curate_body(tweet["body"])
    tweet_collection.update_many(
        {"_id": tweet["_id"]},
        {"$set": {"curated body": str(cleaned_body)}},
    )

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\61469\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\61469\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


*EXTRACTING KEYWORDS*

In [4]:
def remove_stopwords(words):
    """Return the items of ``words`` that are not English stop words.

    ``words`` is an iterable of string tokens; the result is a new list that
    preserves the original order and the original casing of each kept token.
    """
    # Load the stop-word list once and keep it in a set: calling
    # stopwords.words("english") inside the loop re-read the list for every
    # single token.
    english_stopwords = set(stopwords.words("english"))
    # Compare in lowercase so capitalised stop words ("The", "And") are
    # filtered out as well.
    return [w for w in words if w.lower() not in english_stopwords]

def find_keywords_in_text(text):
    """Tokenise ``text`` and return its tokens with the stop words filtered out."""
    return remove_stopwords(word_tokenize(text))

def keywords_to_csv(keywords):
    """Join ``keywords`` into a single comma-separated string.

    ``keywords`` is an iterable of strings. An empty input gives ``""`` and
    there is no trailing comma. No quoting or escaping is applied, so a
    keyword that itself contains a comma is written as-is.
    """
    # str.join replaces the manual index-tracking loop and avoids building
    # the result through repeated string concatenation.
    return ",".join(keywords)

# For every tweet: derive the keywords from its curated body and store them
# back on the document as a comma-separated string under "keyword".
tweets_with_body = db["10000 tweets"].find({}, {"curated body": 1, "_id": 1})
for tweet in tweets_with_body:
    tweet_keywords = find_keywords_in_text(tweet["curated body"])
    keyword_csv = keywords_to_csv(tweet_keywords)
    db["10000 tweets"].update_many(
        {"_id": tweet["_id"]},
        {"$set": {"keyword": str(keyword_csv)}},
    )

<div style="font-size: 17px">
    <b> TASK 2 </b>
</div >

*CURATING DATA - PREPROCESSING*

In [5]:
# Export one curated body per line. The file is opened once, in write mode,
# so re-running this cell regenerates the export instead of appending a
# duplicate copy (and the file is no longer reopened for every tweet).
with open('curated_body_tweet.txt', 'w', encoding='utf-8') as f:
    for t in db["10000 tweets"].find({},{"curated body": 1, "_id": 0}):
        f.write(str(t["curated body"]) + "\n")

<div style="font-size: 17px">
    <b> TASK 3 </b>
</div >

*CURATING DATA*

In [6]:
def remove_timezone(text):
    """Replace timezone terms such as 'Pacific Time' with a single space.

    A match is one word, whitespace, then the whole word ``Time``. These terms
    carry little location meaning in actor.twitterTimeZone.
    """
    # \b stops the pattern from matching the start of longer words: without
    # it "New York Times" was turned into "New  s".
    return re.sub(r'[\w]+[\s]+Time\b', " ", text)

def capitalise_words(text):
    """Title-case ``text`` so that downstream location-name matching sees capitalised words."""
    title_cased = text.title()
    return title_cased

def curate_location(text):
    """Run a location string through the location cleaning pipeline and return the result.

    The steps run in this fixed order: emails, URLs, @-mentions, timezone
    terms, digits, non-word characters, single-character tokens,
    title-casing, and finally whitespace normalisation.
    """
    cleaning_steps = (
        remove_email,
        remove_URL,
        remove_tagging,
        remove_timezone,
        remove_digits,
        remove_non_alphabetical,
        remove_single_character,
        capitalise_words,
        join_single_space,
    )
    for step in cleaning_steps:
        text = step(text)
    return text

#accesses database and its collection '10000 tweets'
client = MongoClient("localhost:27017")
db = client["Assignment_1"]

# Which location fields exist is decided from the document find() already
# returned, instead of issuing several count_documents queries per tweet.
for t in db["10000 tweets"].find({},{"actor": 1, "_id": 1}):
    actor = t.get("actor")
    if not isinstance(actor, dict):
        actor = {}
    location = actor.get("location")
    has_timezone = "twitterTimeZone" in actor
    has_displayName = isinstance(location, dict) and "displayName" in location

    if not (has_timezone or has_displayName):
        # neither field is present: nothing to curate, the tweet is left untouched
        continue

    # "None" is the placeholder string stored when one of the two fields is missing
    curated_timezone = curate_location(str(actor["twitterTimeZone"])) if has_timezone else "None"
    curated_displayName = curate_location(str(location["displayName"])) if has_displayName else "None"

    # One update writes all three derived fields; _id is unique, so a single
    # document is affected.
    db["10000 tweets"].update_one(
        {"_id": t["_id"]},
        {"$set": {
            "curated twitterTimeZone": str(curated_timezone),
            "curated displayName": str(curated_displayName),
            "curated location": str(curated_timezone) + " , " + str(curated_displayName),
        }},
    )

In [7]:
# Export one curated location per line. Only tweets that actually have a
# "curated location" are queried: the curation step skips tweets with neither
# a time zone nor a display name, and indexing the missing field raised a
# KeyError. The file is opened once, in write mode, so re-running this cell
# regenerates the export instead of appending a duplicate copy.
with open('tweet_city.txt', 'w', encoding='utf-8') as f:
    for t in db["10000 tweets"].find({"curated location": {"$exists": True}}, {"curated location": 1, "_id": 0}):
        f.write(str(t["curated location"]) + "\n")

<div style="font-size: 17px">
    <b> TASK 4 </b>
</div >

*EXPLORING DATA*

A brief analysis of the "object.id" values reveals that most of them start with "object:search.twitter.com,2005:" or "tag:search.twitter.com,2005:". As this prefix does not contribute to the sorting of the "object.id" values, we first check whether the "object.id" of all 10000 tweets contains it, so that it can safely be removed.

In [8]:
# Count the tweets whose "object.id" contains the search.twitter.com marker.
id_prefix_filter = {"object.id": {"$regex": ":search.twitter.com,2005:"}}
db["10000 tweets"].count_documents(id_prefix_filter)

10000

As the output shows that all 10000 tweets contain either "object:search.twitter.com,2005:" or "tag:search.twitter.com,2005:", we can curate the "object.id" values by removing this prefix for ease of downstream sorting.

*CURATING DATA*

In [9]:
def remove_id_non_essentials(text):
    """Strip the 'object:search.twitter.com,2005:' / 'tag:search.twitter.com,2005:' marker from an object id."""
    prefix_pattern = re.compile(r'[\w]+:search.twitter.com,2005:')
    return prefix_pattern.sub("", text)

# Store the object id, stripped of its search.twitter.com marker, on every
# tweet under "curated object id".
for tweet in db["10000 tweets"].find({}, {"object": 1, "_id": 1}):
    bare_object_id = remove_id_non_essentials(tweet["object"]["id"])
    db["10000 tweets"].update_many(
        {"_id": tweet["_id"]},
        {"$set": {"curated object id": str(bare_object_id)}},
    )

In [10]:
# Export one curated object id per line. The file is opened once, in write
# mode, so re-running this cell regenerates the export instead of appending a
# duplicate copy (and the file is no longer reopened for every tweet).
with open('curated_tweet_object_id.txt', 'w', encoding='utf-8') as f:
    for t in db["10000 tweets"].find({}, {"curated object id": 1, "_id": 0}):
        f.write(str(t["curated object id"]) + "\n")

In [11]:
# Export each curated object id as a one-element JSON array, one array per
# line. The file is opened once, in write mode, so re-running this cell
# regenerates the export instead of appending a duplicate copy.
with open("curated_tweet_object_id.json", "w") as jsonFile:
    for t in db["10000 tweets"].find({}, {"curated object id": 1, "_id": 0}):
        json_string = json.dumps([int(t["curated object id"])])
        jsonFile.write(json_string + "\n")

<div style="font-size: 17px">
    <b> TASK 5 </b>
</div >

In [12]:
# Export each curated body as a one-element JSON array, one array per line.
# The file is opened once, in write mode, so re-running this cell regenerates
# the export instead of appending a duplicate copy.
with open("curated_body_tweet.json", "w") as jsonFile:
    for t in db["10000 tweets"].find({}, {"curated body": 1, "_id": 0}):
        json_string = json.dumps([str(t["curated body"])])
        jsonFile.write(json_string + "\n")