In [1]:
from google.colab import drive 
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash

--2022-05-01 06:09:29--  http://setup.johnsnowlabs.com/colab.sh
Resolving setup.johnsnowlabs.com (setup.johnsnowlabs.com)... 51.158.130.125
Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://setup.johnsnowlabs.com/colab.sh [following]
--2022-05-01 06:09:29--  https://setup.johnsnowlabs.com/colab.sh
Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:443... connected.
HTTP request sent, awaiting response... 302 Moved Temporarily
Location: https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh [following]
--2022-05-01 06:09:30--  https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.111.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:44

In [28]:
import pandas as pd
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.sql.functions import *
from sklearn.metrics import classification_report, accuracy_score, f1_score,roc_auc_score

In [4]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Sun May  1 06:10:04 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   35C    P0    26W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [5]:
import sparknlp
spark = sparknlp.start(gpu=True)
spark

print("Spark NLP version: {}".format(sparknlp.version()))
print("Apache Spark version: {}".format(spark.version))

Spark NLP version: 3.4.3
Apache Spark version: 3.0.3


In [7]:
input_path_hk = "/content/drive/MyDrive/Tweets/Tweets_SG_HK_AUS/HKTweetsHydrated.csv"
df = pd.read_csv(input_path_hk)

In [8]:
df.isnull().sum()

tweet_id                   0
user_id                    0
tweet_timestamp            0
keyword                    0
country/region             0
valence_intensity          0
fear_intensity             0
anger_intensity            0
happiness_intensity        0
sadness_intensity          0
sentiment                  0
emotion                    0
text                   22761
dtype: int64

In [9]:
df_updated = df.dropna()

In [10]:
df_updated.isnull().sum()

tweet_id               0
user_id                0
tweet_timestamp        0
keyword                0
country/region         0
valence_intensity      0
fear_intensity         0
anger_intensity        0
happiness_intensity    0
sadness_intensity      0
sentiment              0
emotion                0
text                   0
dtype: int64

In [11]:
sparkDF=spark.createDataFrame(df_updated) 
sparkDF.printSchema()
sparkDF.show()


root
 |-- tweet_id: long (nullable = true)
 |-- user_id: long (nullable = true)
 |-- tweet_timestamp: string (nullable = true)
 |-- keyword: string (nullable = true)
 |-- country/region: string (nullable = true)
 |-- valence_intensity: double (nullable = true)
 |-- fear_intensity: double (nullable = true)
 |-- anger_intensity: double (nullable = true)
 |-- happiness_intensity: double (nullable = true)
 |-- sadness_intensity: double (nullable = true)
 |-- sentiment: string (nullable = true)
 |-- emotion: string (nullable = true)
 |-- text: string (nullable = true)

+-------------------+-------------------+-------------------+-------+--------------+------------------+--------------+---------------+-------------------+------------------+----------------+-------------------+--------------------+
|           tweet_id|            user_id|    tweet_timestamp|keyword|country/region| valence_intensity|fear_intensity|anger_intensity|happiness_intensity| sadness_intensity|       sentiment|       

In [12]:
spark_df = sparkDF.selectExpr("text","sentiment")
spark_df.show()

+--------------------+----------------+
|                text|       sentiment|
+--------------------+----------------+
|It wouldn’t be su...|        negative|
|Latest, another j...|        negative|
|@niubi Their pare...|neutral or mixed|
|Confusion as #WHO...|        negative|
|6 provinces outsi...|        positive|
|😷?

“Masks are u...|        negative|
|What China eats i...|        negative|
|So the outbreak m...|        negative|
|@jorge_guajardo @...|        negative|
|@zlj517 Amazing ,...|        positive|
|Prof @gmleunghku ...|        negative|
|@IncomeDisparity ...|        negative|
|Hong Kong actor L...|        positive|
|Despite the lack ...|        negative|
|Funny but so true...|        positive|
|#WHO encouraged t...|        negative|
|Foreigners prepar...|        negative|
|@SalonTrans @Solo...|neutral or mixed|
|Wuhan is just a m...|neutral or mixed|
|@GuoLibrary The r...|   very negative|
+--------------------+----------------+
only showing top 20 rows



In [13]:
spark_df = spark_df.withColumn('text',regexp_replace('text',r'http\S+',''))
spark_df = spark_df.withColumn('text',regexp_replace('text','@\w+',''))
spark_df = spark_df.withColumn('text',regexp_replace('text','#',''))
spark_df = spark_df.withColumn('text',regexp_replace('text','RT',''))
spark_df = spark_df.withColumn('text',regexp_replace('text','&amp;',''))
spark_df = spark_df.withColumn('text',regexp_replace('text','&quot;',''))
spark_df = spark_df.withColumn('text',regexp_replace('text','&gt',''))
spark_df = spark_df.withColumn('text',regexp_replace('text','&lt',''))


In [14]:
spark_df_updated = spark_df.withColumn("sentiments",
       when((col("sentiment") == "very negative"), "negative")
      .when((col("sentiment") == "very positive"), "positive")
      .when((col("sentiment") == "neutral or mixed"), "neutral")
      .otherwise(col("sentiment"))).selectExpr('text','sentiments')

In [15]:
# Remove neutral
spark_df_filtered=spark_df_updated.filter(spark_df_updated.sentiments != "neutral")

In [16]:
spark_df_filtered.groupBy("sentiments") \
    .count() \
    .orderBy(col("count").desc()) \
    .show()

+----------+-----+
|sentiments|count|
+----------+-----+
|  negative|70685|
|  positive|32474|
+----------+-----+



# Undersampling

In [17]:
# Undersampling
major_df = spark_df_filtered.filter(col("sentiments") == "negative")
minor_df = spark_df_filtered.filter(col("sentiments") == "positive")
ratio = int(major_df.count()/minor_df.count())
print("ratio: {}".format(ratio))
sampled_majority_df = major_df.sample(False,1/ratio)
combined_df = sampled_majority_df.unionAll(minor_df)

ratio: 2


In [18]:
combined_df.groupBy("sentiments") \
    .count() \
    .orderBy(col("count").desc()) \
    .show()

+----------+-----+
|sentiments|count|
+----------+-----+
|  negative|35250|
|  positive|32474|
+----------+-----+



# Training the model

In [19]:
def get_embeddings(type):
  if type == "glove":
    return WordEmbeddingsModel.pretrained().setInputCols(["document","lemma"]).setOutputCol("embeddings").setCaseSensitive(False)
  elif type == "bert":
    return BertEmbeddings.pretrained('bert_base_cased', 'en').setInputCols(["document",'lemma']).setOutputCol("embeddings")
  elif type == "elmo":
    return ElmoEmbeddings.pretrained('elmo', 'en').setInputCols(["document",'lemma']).setOutputCol("embeddings")

In [20]:
documentAssembler = DocumentAssembler().setInputCol("text").setOutputCol("document").setCleanupMode("shrink")
tokenizer = Tokenizer().setInputCols(["document"]).setOutputCol("token")
normalizer = Normalizer().setInputCols(["token"]).setOutputCol("normalized")
stopwords_cleaner = StopWordsCleaner().setInputCols(["normalized"]).setOutputCol("cleanTokens").setCaseSensitive(False)
lemma = LemmatizerModel.pretrained("lemma_antbnc").setInputCols(["cleanTokens"]).setOutputCol("lemma")
word_embeddings = get_embeddings("elmo")
embeddings_sentence = SentenceEmbeddings().setInputCols(["document","embeddings"]).setOutputCol("sentence_embeddings").setPoolingStrategy("AVERAGE")


lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]
elmo download started this may take some time.
Approximate size to download 334.1 MB
[OK!]


In [21]:
classifierdl = ClassifierDLApproach()\
               .setInputCols(["sentence_embeddings"])\
               .setOutputCol("class")\
               .setLabelColumn("sentiments")\
               .setBatchSize(64)\
               .setMaxEpochs(10)\
               .setLr(1e-3)\
               .setEnableOutputLogs(True) 

In [22]:
clf_pipeline = Pipeline(stages=[documentAssembler,tokenizer,normalizer,stopwords_cleaner,lemma,word_embeddings,embeddings_sentence,classifierdl])

In [23]:
train, test = combined_df.randomSplit([0.7, 0.3], seed=42)

In [24]:
clf_pipelineModel = clf_pipeline.fit(train)

In [25]:
preds = clf_pipelineModel.transform(test)

In [26]:
preds.select('text','sentiments','class.result').show(10,truncate=80)

+--------------------------------------------------------------------------------+----------+----------+
|                                                                            text|sentiments|    result|
+--------------------------------------------------------------------------------+----------+----------+
|

Of course CCP outlets would deliberately left out how this team from Hong K...|  negative|[negative]|
| 

We must not let the CCP rewrite the history of COVID-19, aka Wuhan pneumonia.|  negative|[negative]|
| 

We must not let the CCP rewrite the history of COVID-19, aka Wuhan pneumonia.|  negative|[negative]|
|

WolfOnWallstreet SellThisPen 

"SupplyAndDemand" Corona FaceMask shortage i...|  negative|[negative]|
|
   ChineseVirus HK government again chartered to accept the HK people from W...|  negative|[negative]|
|
. Really I think Congress is better than BJP, at least ideologically. But ch...|  negative|[negative]|
|
A Hong Kong Newspaper claimed that COVID-19 aka Chine

In [27]:

df = clf_pipelineModel.transform(test).select('text','sentiments','class.result').toPandas()
df['result'] = df['result'].apply(lambda x:x[0])


In [29]:
print("Classification report")
print(classification_report(df.sentiments,df.result))
print('\n')
print("Accuracy Metrics")
print(accuracy_score(df.sentiments,df.result))
print('\n')


Classification report
              precision    recall  f1-score   support

    negative       0.81      0.84      0.82     10306
    positive       0.82      0.79      0.81      9686

    accuracy                           0.82     19992
   macro avg       0.82      0.82      0.82     19992
weighted avg       0.82      0.82      0.82     19992



Accuracy Metrics
0.8164765906362546




In [30]:
# Save model
clf_pipelineModel.stages[-1].write().overwrite().save('/content/drive/MyDrive/CS5425_Big_Data_Project/Models/hk_elmo_dl_model')