In [1]:
import pyspark
from pyspark.sql import SparkSession
# Create the Spark session for this notebook, or reuse one that already exists.
spark = SparkSession.builder.appName('9_NPL3_Sentiment Analysis ').getOrCreate()

# Description of the input file.
file_location = "9_financial_news.csv"
file_type = "csv"
infer_schema = "false"        # no type inference: every column is read as a string
first_row_is_header = "true"
delimiter = ","

# Read the CSV into a DataFrame using the options declared above.
df = (
    spark.read.format(file_type)
    .option("inferSchema", infer_schema)
    .option("header", first_row_is_header)
    .option("sep", delimiter)
    .load(file_location)
)

# Preview the first five rows without truncating the long text column.
df.show(5, truncate=False)


22/10/30 14:42:55 WARN Utils: Your hostname, m0 resolves to a loopback address: 127.0.1.1; using 10.0.2.15 instead (on interface enp0s3)
22/10/30 14:42:55 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/10/30 14:42:58 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/10/30 14:43:04 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
22/10/30 14:43:04 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
22/10/30 14:43:04 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.


                                                                                

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------+
|News                                                                                                                                                                                                                                               |Sentiment|
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------+
|- BEIJING XFN-ASIA - Hong Kong-listed Standard Chartered Bank said it has signed a China mobile phone dealer financing agreement with Nokia , making it the first foreign bank to offer financing to the country 's small and medium en

In [2]:
# Number of rows in the loaded DataFrame.
df.count()

                                                                                

962

In [3]:
# Remove non ASCII characters

from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import pos_tag
import string
import re

# remove non ASCII characters
def strip_non_ascii(data_str):
    '''Return ``data_str`` keeping only characters with code points 1..126.

    Non-ASCII characters are dropped, and so are NUL (0) and DEL (127),
    because the filter is ``0 < ord(c) < 127``.

    Args:
        data_str: text to clean, or ``None`` (Spark passes ``None`` for a
            null cell when this function is used as a UDF).

    Returns:
        The filtered string, or ``None`` when the input is ``None``.
    '''
    # A null cell must stay null instead of raising TypeError in the UDF.
    if data_str is None:
        return None
    return ''.join(c for c in data_str if 0 < ord(c) < 127)
# Wrap strip_non_ascii as a PySpark UDF whose result column is a string.
strip_non_ascii_udf = udf(strip_non_ascii, StringType())

In [4]:
# Try the UDF out: add an ASCII-only copy of the News column and preview it.
df = df.withColumn(
    'News_non_asci',
    strip_non_ascii_udf(df['News']),
)
df.show(n=5)

                                                                                

+--------------------+---------+--------------------+
|                News|Sentiment|       News_non_asci|
+--------------------+---------+--------------------+
|- BEIJING XFN-ASI...|        1|- BEIJING XFN-ASI...|
|- Operating profi...|        1|- Operating profi...|
|- Provides summar...|        0|- Provides summar...|
|- So , the sales ...|        0|- So , the sales ...|
|- UPM-Kymmene upg...|        1|- UPM-Kymmene upg...|
+--------------------+---------+--------------------+
only showing top 5 rows



In [5]:
# fixed abbreviation
# Ordered (compiled pattern, replacement) rules applied by fix_abbreviation.
# The order is significant because each rule sees the output of the previous one.
_ABBREVIATION_RULES = tuple(
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        (r'\bthats\b', 'that is'),
        (r'\bive\b', 'i have'),
        (r'\bim\b', 'i am'),
        (r'\bya\b', 'yeah'),
        (r'\bcant\b', 'can not'),
        (r'\bdont\b', 'do not'),
        (r'\bwont\b', 'will not'),
        (r'\bid\b', 'i would'),
        (r'wtf', 'what the fuck'),
        (r'\bwth\b', 'what the hell'),
        (r'\br\b', 'are'),
        (r'\bu\b', 'you'),
        (r'\bk\b', 'OK'),
        (r'\bsux\b', 'sucks'),
        (r'\bno+\b', 'no'),
        (r'\bcoo+\b', 'cool'),
        # Needs the leading \b: without it "rt" was stripped from the end of
        # ordinary words ("report" -> "repo", "support" -> "suppo").
        (r'\brt\b', ''),
    )
)


def fix_abbreviation(data_str):
    """Lower-case the text and expand a fixed list of informal abbreviations.

    The rules are applied one after another, then the result is stripped of
    leading and trailing whitespace. The standalone token "rt" is removed.

    Args:
        data_str: text to normalise, or ``None`` (a null cell when used as a
            Spark UDF).

    Returns:
        The normalised lower-case string, or ``None`` when the input is ``None``.
    """
    # A null cell must stay null instead of raising AttributeError in the UDF.
    if data_str is None:
        return None
    text = data_str.lower()
    for pattern, replacement in _ABBREVIATION_RULES:
        text = pattern.sub(replacement, text)
    return text.strip()

# Wrap fix_abbreviation as a PySpark UDF whose result column is a string.
fix_abbreviation_udf = udf(fix_abbreviation, StringType())

In [6]:
# Chain from the ASCII-only column, not the raw News column; otherwise the
# non-ASCII stripping step never reaches the later cleaning stages.
df = df.withColumn('News_fixed_abbrev', fix_abbreviation_udf(df['News_non_asci']))
df.show(5, True)

                                                                                

+--------------------+---------+--------------------+--------------------+
|                News|Sentiment|       News_non_asci|   News_fixed_abbrev|
+--------------------+---------+--------------------+--------------------+
|- BEIJING XFN-ASI...|        1|- BEIJING XFN-ASI...|- beijing xfn-asi...|
|- Operating profi...|        1|- Operating profi...|- operating profi...|
|- Provides summar...|        0|- Provides summar...|- provides summar...|
|- So , the sales ...|        0|- So , the sales ...|- so , the sales ...|
|- UPM-Kymmene upg...|        1|- UPM-Kymmene upg...|- upm-kymmene upg...|
+--------------------+---------+--------------------+--------------------+
only showing top 5 rows



In [7]:
# Patterns used by remove_features, compiled once. Raw strings avoid the
# invalid "\w" / "\." escape sequences that the non-raw literals contained;
# the patterns themselves are unchanged.
_URL_RE = re.compile(r'https?://(www.)?\w+\.\w+(/\w+)*/?')
_PUNCT_RE = re.compile('[%s]' % re.escape(string.punctuation))
_NUM_RE = re.compile(r'(\d+)')
_MENTION_RE = re.compile(r'@(\w+)')
_ALPHA_NUM_RE = re.compile(r"^[a-z0-9_.]+$")


def remove_features(data_str):
    """Strip URLs, @mentions, punctuation and digits, keeping plain words.

    The text is lower-cased, then hyperlinks, @mentions, punctuation
    characters and digit runs are each replaced by a space, in that order.
    Of the remaining whitespace-separated words, only those longer than one
    character that consist solely of ``a-z``, ``0-9``, ``_`` or ``.`` are
    kept; the survivors are joined with single spaces.

    Args:
        data_str: text to clean, or ``None`` (a null cell when used as a
            Spark UDF).

    Returns:
        The cleaned string (possibly empty), or ``None`` when the input is
        ``None``.
    """
    # A null cell must stay null instead of raising AttributeError in the UDF.
    if data_str is None:
        return None
    text = data_str.lower()
    # URLs must go first: punctuation removal would otherwise break them up.
    for pattern in (_URL_RE, _MENTION_RE, _PUNCT_RE, _NUM_RE):
        text = pattern.sub(' ', text)
    # Drop single-character words and words with any other character.
    kept = [
        word for word in text.split()
        if len(word) > 1 and _ALPHA_NUM_RE.match(word)
    ]
    return " ".join(kept)
# Wrap remove_features as a PySpark UDF whose result column is a string.
remove_features_udf = udf(remove_features, StringType())

In [8]:
# Clean the abbreviation-expanded text and preview the new column.
df = df.withColumn(
    'News_Cleaned',
    remove_features_udf(df['News_fixed_abbrev']),
)
df.show(n=5, truncate=True)

                                                                                

+--------------------+---------+--------------------+--------------------+--------------------+
|                News|Sentiment|       News_non_asci|   News_fixed_abbrev|        News_Cleaned|
+--------------------+---------+--------------------+--------------------+--------------------+
|- BEIJING XFN-ASI...|        1|- BEIJING XFN-ASI...|- beijing xfn-asi...|beijing xfn asia ...|
|- Operating profi...|        1|- Operating profi...|- operating profi...|operating profit ...|
|- Provides summar...|        0|- Provides summar...|- provides summar...|provides summary ...|
|- So , the sales ...|        0|- So , the sales ...|- so , the sales ...|so the sales grow...|
|- UPM-Kymmene upg...|        1|- UPM-Kymmene upg...|- upm-kymmene upg...|upm kymmene upgra...|
+--------------------+---------+--------------------+--------------------+--------------------+
only showing top 5 rows



In [9]:
# Sentiment Analysis main function
!pip install textblob
from pyspark.sql.types import FloatType
from textblob import TextBlob

def sentiment_analysis(text):
    """Return the TextBlob polarity score of ``text``.

    Args:
        text: the text to score, or ``None`` (a null cell when used as a
            Spark UDF).

    Returns:
        ``TextBlob(text).sentiment.polarity``, or ``None`` when the input is
        ``None``.
    """
    # TextBlob rejects non-string input, so a null cell must stay null.
    if text is None:
        return None
    return TextBlob(text).sentiment.polarity

# Wrap the scorer as a PySpark UDF with a float result, then score the cleaned text.
sentiment_analysis_udf = udf(sentiment_analysis, FloatType())
df = df.withColumn(
    "sentiment_score",
    sentiment_analysis_udf(df['News_Cleaned']),
)
df.show(n=5, truncate=True)



[Stage 8:>                                                          (0 + 1) / 1]

+--------------------+---------+--------------------+--------------------+--------------------+---------------+
|                News|Sentiment|       News_non_asci|   News_fixed_abbrev|        News_Cleaned|sentiment_score|
+--------------------+---------+--------------------+--------------------+--------------------+---------------+
|- BEIJING XFN-ASI...|        1|- BEIJING XFN-ASI...|- beijing xfn-asi...|beijing xfn asia ...|       -0.03125|
|- Operating profi...|        1|- Operating profi...|- operating profi...|operating profit ...|            0.6|
|- Provides summar...|        0|- Provides summar...|- provides summar...|provides summary ...|            0.0|
|- So , the sales ...|        0|- So , the sales ...|- so , the sales ...|so the sales grow...|            0.1|
|- UPM-Kymmene upg...|        1|- UPM-Kymmene upg...|- upm-kymmene upg...|upm kymmene upgra...|            0.0|
+--------------------+---------+--------------------+--------------------+--------------------+---------

                                                                                

In [11]:
# Display the DataFrame, which now includes the sentiment_score column.
df.show()

[Stage 9:>                                                          (0 + 1) / 1]

+--------------------+---------+--------------------+--------------------+--------------------+---------------+
|                News|Sentiment|       News_non_asci|   News_fixed_abbrev|        News_Cleaned|sentiment_score|
+--------------------+---------+--------------------+--------------------+--------------------+---------------+
|- BEIJING XFN-ASI...|        1|- BEIJING XFN-ASI...|- beijing xfn-asi...|beijing xfn asia ...|       -0.03125|
|- Operating profi...|        1|- Operating profi...|- operating profi...|operating profit ...|            0.6|
|- Provides summar...|        0|- Provides summar...|- provides summar...|provides summary ...|            0.0|
|- So , the sales ...|        0|- So , the sales ...|- so , the sales ...|so the sales grow...|            0.1|
|- UPM-Kymmene upg...|        1|- UPM-Kymmene upg...|- upm-kymmene upg...|upm kymmene upgra...|            0.0|
|( ADP News ) - De...|        1|( ADP News ) - De...|( adp news ) - de...|adp news dec finn...|         

                                                                                

In [12]:
# DataFrame transformations return a new DataFrame, so the original
# selectExpr call, whose result was discarded, had no effect. Assign the
# result, and cast with withColumn so the other columns are kept
# (selectExpr with a single expression would keep only sentiment_score).
df = df.withColumn("sentiment_score", df["sentiment_score"].cast("float"))
df.printSchema()
df.show()

root
 |-- News: string (nullable = true)
 |-- Sentiment: string (nullable = true)
 |-- News_non_asci: string (nullable = true)
 |-- News_fixed_abbrev: string (nullable = true)
 |-- News_Cleaned: string (nullable = true)
 |-- sentiment_score: float (nullable = true)



[Stage 10:>                                                         (0 + 1) / 1]

+--------------------+---------+--------------------+--------------------+--------------------+---------------+
|                News|Sentiment|       News_non_asci|   News_fixed_abbrev|        News_Cleaned|sentiment_score|
+--------------------+---------+--------------------+--------------------+--------------------+---------------+
|- BEIJING XFN-ASI...|        1|- BEIJING XFN-ASI...|- beijing xfn-asi...|beijing xfn asia ...|       -0.03125|
|- Operating profi...|        1|- Operating profi...|- operating profi...|operating profit ...|            0.6|
|- Provides summar...|        0|- Provides summar...|- provides summar...|provides summary ...|            0.0|
|- So , the sales ...|        0|- So , the sales ...|- so , the sales ...|so the sales grow...|            0.1|
|- UPM-Kymmene upg...|        1|- UPM-Kymmene upg...|- upm-kymmene upg...|upm kymmene upgra...|            0.0|
|( ADP News ) - De...|        1|( ADP News ) - De...|( adp news ) - de...|adp news dec finn...|         

                                                                                

In [None]:
# Exercise: Classify the sentiment_score
# <0 ==> Negative
# =0 ==> Neutral
# >0 ==> Positive
    