# Regular Expression Resources:

1) https://www.regular-expressions.info/tools.html


2) https://www.freeformatter.com/java-regex-tester.html


3) https://www.freeformatter.com


# What we find while exploring the data in Notebook #15

<ul>
    <li>
        We need to apply a proper schema
    </li>
    <li>
    The date column need fixing
    </li>
    <li>
    We need to extract/remove usernames
    </li>
    <li>
    We need to extract hashtags and replace them with equivalent word
    </li>
    <li>
    We need to remove URLs as our algorithm will not understand them
    </li>
    <li>
    The same goes for email addresses
    </li>
    <li>
    Symbols stored in HTML notation do not appear properly unescaped(example: &lt;)
    </li>
    <li>
    Unwanted characters are present like starts or black dotted shapes
    </li>
</ul>

In [1]:
import pandas as pd
import html
from pyspark.sql import SparkSession
from pyspark.sql import functions as f

spark = (
        SparkSession
        .builder
        .config("spark.sql.legacy.timeParserPolicy","LEGACY")
        .appName("data-cleaning")
        .getOrCreate()
)
#Show unlimited Columns
pd.options.display.max_columns = None
#Show max 250 rows 
pd.options.display.max_rows = 250
#Max col width = 150 as max tweet size is 144
pd.options.display.max_colwidth = 150



# The data is a CSV with emoticons removed. Data file format has 6 fields:
# 0 - the polarity of the tweet (0 = negative, 2 = neutral, 4 = positive)
# 1 - the id of the tweet (2087)
# 2 - the date of the tweet (Sat May 16 23:58:44 UTC 2009)
# 3 - the query (lyx). If there is no query, then this value is NO_QUERY.
# 4 - the user that tweeted (robotickilldozr)
# 5 - the text of the tweet (Lyx is cool)
schema = "polarity FLOAT, id LONG, date_time TIMESTAMP,query STRING,user STRING,text STRING"
timestampformat = "EEE MMM dd HH:mm:ss zzz yyyy"

#patH for INPUT & OUTPUT
IN_PATH = "datasets/sentiment-140-training-data/RAW"
OUT_PATH = "datasets/sentiment-140-training-data/CLEAN"

#Declaring a common schema
spark_reader = spark.read.schema(schema)

@f.udf
def html_unescape(s: str):
    if isinstance(s, str):
        return html.unescape(s)
    return s

#REGEX
user_regex = r"(@\w{1,15})"
hashtag_regex = r"(#\w{1,})"
url_regex = r"((https?|ftp|file):\/{2,3})+([-\w+&@#/%=~|$?!:,.]*)|(www.)+([-\w+&@#/%=~|$?!:,.]*)"
email_regex = r"[\w.-]+@[\w.-]+\.[a-zA-Z]{1,}"


def clean_data(df):
    df = (
        df
        .withColumn("original_text",f.col("text"))
        .withColumn("text",f.regexp_replace(f.col("text"),url_regex,""))
        .withColumn("text",f.regexp_replace(f.col("text"),email_regex,""))
        .withColumn("text",f.regexp_replace(f.col("text"),user_regex,""))
        .withColumn("text",f.regexp_replace(f.col("text"),"#"," "))
        .withColumn("text",html_unescape(f.col("text")))
        .filter("text != ''")
    )
    return df

df_raw = spark_reader.csv(IN_PATH,timestampFormat=timestampformat)
df_clean = clean_data(df_raw)

df_clean.write.partitionBy("polarity").parquet(OUT_PATH , mode="overwrite")

In [2]:
df_clean.count()

1597303