In [1]:
from google.colab import drive 
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash

--2022-04-18 05:50:19--  http://setup.johnsnowlabs.com/colab.sh
Resolving setup.johnsnowlabs.com (setup.johnsnowlabs.com)... 51.158.130.125
Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://setup.johnsnowlabs.com/colab.sh [following]
--2022-04-18 05:50:20--  https://setup.johnsnowlabs.com/colab.sh
Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:443... connected.
HTTP request sent, awaiting response... 302 Moved Temporarily
Location: https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh [following]
--2022-04-18 05:50:21--  https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.111.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:44

In [3]:
import pandas as pd
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.sql.functions import *
from pyspark.ml.feature import HashingTF
from pyspark.ml.feature import IDF
from pyspark.ml.pipeline import PipelineModel
from pyspark.ml.classification import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, f1_score

In [4]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Mon Apr 18 05:50:55 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   38C    P8     9W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [5]:
import sparknlp
spark = sparknlp.start(gpu=True)
spark

print("Spark NLP version: {}".format(sparknlp.version()))
print("Apache Spark version: {}".format(spark.version))

Spark NLP version: 3.4.3
Apache Spark version: 3.0.3


In [9]:
input_path = "/content/drive/MyDrive/BigData/Data/Tweets_SG_HK_AUS/HKTweetsHydrated.csv"
df = pd.read_csv(input_path)

# Preprocessing

In [10]:
df.isnull().sum()

tweet_id                   0
user_id                    0
tweet_timestamp            0
keyword                    0
country/region             0
valence_intensity          0
fear_intensity             0
anger_intensity            0
happiness_intensity        0
sadness_intensity          0
sentiment                  0
emotion                    0
text                   22761
dtype: int64

In [11]:
df_updated = df.dropna()

In [12]:
df_updated.isnull().sum()

tweet_id               0
user_id                0
tweet_timestamp        0
keyword                0
country/region         0
valence_intensity      0
fear_intensity         0
anger_intensity        0
happiness_intensity    0
sadness_intensity      0
sentiment              0
emotion                0
text                   0
dtype: int64

In [13]:
sparkDF=spark.createDataFrame(df_updated) 
sparkDF.printSchema()
sparkDF.show()


root
 |-- tweet_id: long (nullable = true)
 |-- user_id: long (nullable = true)
 |-- tweet_timestamp: string (nullable = true)
 |-- keyword: string (nullable = true)
 |-- country/region: string (nullable = true)
 |-- valence_intensity: double (nullable = true)
 |-- fear_intensity: double (nullable = true)
 |-- anger_intensity: double (nullable = true)
 |-- happiness_intensity: double (nullable = true)
 |-- sadness_intensity: double (nullable = true)
 |-- sentiment: string (nullable = true)
 |-- emotion: string (nullable = true)
 |-- text: string (nullable = true)

+-------------------+-------------------+-------------------+-------+--------------+------------------+--------------+---------------+-------------------+------------------+----------------+-------------------+--------------------+
|           tweet_id|            user_id|    tweet_timestamp|keyword|country/region| valence_intensity|fear_intensity|anger_intensity|happiness_intensity| sadness_intensity|       sentiment|       

In [14]:
spark_df = sparkDF.selectExpr("text","sentiment")
spark_df.show()

+--------------------+----------------+
|                text|       sentiment|
+--------------------+----------------+
|It wouldn’t be su...|        negative|
|Latest, another j...|        negative|
|@niubi Their pare...|neutral or mixed|
|Confusion as #WHO...|        negative|
|6 provinces outsi...|        positive|
|😷?

“Masks are u...|        negative|
|What China eats i...|        negative|
|So the outbreak m...|        negative|
|@jorge_guajardo @...|        negative|
|@zlj517 Amazing ,...|        positive|
|Prof @gmleunghku ...|        negative|
|@IncomeDisparity ...|        negative|
|Hong Kong actor L...|        positive|
|Despite the lack ...|        negative|
|Funny but so true...|        positive|
|#WHO encouraged t...|        negative|
|Foreigners prepar...|        negative|
|@SalonTrans @Solo...|neutral or mixed|
|Wuhan is just a m...|neutral or mixed|
|@GuoLibrary The r...|   very negative|
+--------------------+----------------+
only showing top 20 rows



In [15]:
# Preprocessing
spark_df = spark_df.withColumn('text',regexp_replace('text',r'http\S+',''))
spark_df = spark_df.withColumn('text',regexp_replace('text','@\w+',''))
spark_df = spark_df.withColumn('text',regexp_replace('text','#',''))
spark_df = spark_df.withColumn('text',regexp_replace('text','RT',''))
spark_df = spark_df.withColumn('text',regexp_replace('text','&amp;',''))
spark_df = spark_df.withColumn('text',regexp_replace('text','&quot;',''))
spark_df = spark_df.withColumn('text',regexp_replace('text','&gt',''))
spark_df = spark_df.withColumn('text',regexp_replace('text','&lt',''))


In [16]:
spark_df_updated = spark_df.withColumn("sentiments",
       when((col("sentiment") == "very negative"), 0)
      .when((col("sentiment") == "very positive"), 1)
      .when((col("sentiment") == "negative"), 0)
      .when((col("sentiment") == "positive"), 1)
      .when((col("sentiment") == "neutral or mixed"), 2)
      .otherwise(col("sentiment"))).selectExpr('text','sentiments')

In [17]:
spark_df_int = spark_df_updated.withColumn("sentiment_int",col("sentiments").cast("int"))

In [18]:
# Remove neutral
spark_df_filtered=spark_df_int.filter(spark_df_int.sentiments != 2)

In [19]:
spark_df_filtered.groupBy("sentiment_int") \
    .count() \
    .orderBy(col("count").desc()) \
    .show()

+-------------+-----+
|sentiment_int|count|
+-------------+-----+
|            0|70685|
|            1|32474|
+-------------+-----+



# Undersampling

In [20]:
# Undersampling
major_df = spark_df_filtered.filter(col("sentiment_int") == 0)
minor_df = spark_df_filtered.filter(col("sentiment_int") == 1)
ratio = int(major_df.count()/minor_df.count())
print("ratio: {}".format(ratio))
sampled_majority_df = major_df.sample(False,1/ratio)
combined_df = sampled_majority_df.unionAll(minor_df)

ratio: 2


In [21]:
combined_df.groupBy("sentiment_int") \
    .count() \
    .orderBy(col("count").desc()) \
    .show()

+-------------+-----+
|sentiment_int|count|
+-------------+-----+
|            0|35477|
|            1|32474|
+-------------+-----+



# Training the model

In [22]:
documentAssembler = DocumentAssembler().setInputCol("text").setOutputCol("document").setCleanupMode("shrink")
sentence_detector = SentenceDetector().setInputCols(["document"]).setOutputCol("sentence")
tokenizer = Tokenizer().setInputCols(["sentence"]).setOutputCol("token")
stopwords_cleaner = StopWordsCleaner().setInputCols(["token"]).setOutputCol("cleanTokens").setCaseSensitive(False)
normalizer = Normalizer().setInputCols(["cleanTokens"]).setOutputCol("normalized").setLowercase(True)
finisher = Finisher().setInputCols(["normalized"]).setOutputCols(["token_features"]) # To generate Term Frequency
hashingTF=HashingTF(inputCol="token_features", outputCol="rawFeatures")# To generate Inverse Document Frequency
idf = IDF(inputCol="rawFeatures",outputCol="features",minDocFreq=5)

In [23]:
SVC = LogisticRegression(labelCol = "sentiment_int", featuresCol="features", maxIter=13, regParam=0.2)

In [24]:
clf_pipeline = Pipeline(stages=[documentAssembler,sentence_detector,tokenizer,stopwords_cleaner,normalizer,finisher,hashingTF,idf,SVC])

In [25]:
train, test = combined_df.randomSplit([0.7, 0.3], seed=42)

In [26]:
clf_pipelineModel = clf_pipeline.fit(train)

In [27]:
preds = clf_pipelineModel.transform(test)

In [28]:
preds.select('text','sentiment_int','prediction').show(10,truncate=80)

+--------------------------------------------------------------------------------+-------------+----------+
|                                                                            text|sentiment_int|prediction|
+--------------------------------------------------------------------------------+-------------+----------+
|

Of course CCP outlets would deliberately left out how this team from Hong K...|            0|       0.0|
| 

We must not let the CCP rewrite the history of COVID-19, aka Wuhan pneumonia.|            0|       0.0|
|

WolfOnWallstreet SellThisPen 

"SupplyAndDemand" Corona FaceMask shortage i...|            0|       0.0|
|

as someone who lives in Hong Kong and knows a bit about both WuHan's situat...|            0|       0.0|
|
I  have been trapped in hubei for two months. you politicians can attack eac...|            0|       0.0|
|
healthcare 'Fearism 'Philosophy philosopher university universities vaccines...|            0|       0.0|
| 

 

Why HK safe from covi

In [29]:
df = clf_pipelineModel.transform(test).select('text','sentiment_int','prediction').toPandas()

In [30]:
print("Classification report")
print(classification_report(df.sentiment_int,df.prediction))
print('\n')
print("Accuracy Metrics")
print(accuracy_score(df.sentiment_int,df.prediction))
print('\n')


Classification report
              precision    recall  f1-score   support

           0       0.82      0.86      0.84     10368
           1       0.84      0.81      0.82      9686

    accuracy                           0.83     20054
   macro avg       0.83      0.83      0.83     20054
weighted avg       0.83      0.83      0.83     20054



Accuracy Metrics
0.8321033210332104




In [31]:
clf_pipelineModel.save("/content/drive/MyDrive/BigData/Models/svm_sg")