In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!apt-get install openjdk-8-jre
!apt-get install scala
!pip install py4j
!wget -q https://downloads.apache.org/spark/spark-3.1.2/spark-3.1.2-bin-hadoop3.2.tgz
!tar xf spark-3.1.2-bin-hadoop3.2.tgz
!pip install -q findspark

Reading package lists... Done
Building dependency tree       
Reading state information... Done
openjdk-8-jre is already the newest version (8u312-b07-0ubuntu1~18.04).
0 upgraded, 0 newly installed, 0 to remove and 39 not upgraded.
Reading package lists... Done
Building dependency tree       
Reading state information... Done
scala is already the newest version (2.11.12-4~18.04).
0 upgraded, 0 newly installed, 0 to remove and 39 not upgraded.


In [3]:
!rm -r *.tgz *.zip sample_data
!ls


rm: cannot remove '*.zip': No such file or directory
rm: cannot remove 'sample_data': No such file or directory
drive  spark-3.1.2-bin-hadoop3.2


In [4]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.2-bin-hadoop3.2"


In [5]:
import sys
sys.path.append('/content/drive/My Drive/CS5344/GP')


In [6]:
# output operating time
!pip install ipython-autotime
%load_ext autotime

time: 232 µs (started: 2022-04-02 08:03:59 +00:00)


In [7]:
import pandas as pd
import numpy as np
import re
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
import math
import findspark
findspark.init()
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.ml.feature import CountVectorizer, CountVectorizerModel, HashingTF, IDF, IDFModel

# spark = SparkSession.builder.master("local[*]").getOrCreate()
spark = SparkSession.builder.appName("Test Setup").getOrCreate()
sc = spark.sparkContext

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
time: 6.73 s (started: 2022-04-02 08:03:59 +00:00)


## Input 2 labeled datasets and combine them together for training baseline model
### Output: 
- label_data (dataframe, column=['id', 'text', 'label'])

In [8]:
# input the first labeled dataset
origin_Fake = pd.read_csv('/content/drive/My Drive/CS5344/GP/Fake.csv')
origin_True = pd.read_csv('/content/drive/My Drive/CS5344/GP/True.csv')
# print(origin_Fake.head())
# print(origin_True.head())

# keep the text features and generate the labels
origin_Fake['text'] = origin_Fake.apply(lambda row: row['title']+" "+row['text'], axis=1)
origin_True['text'] = origin_Fake.apply(lambda row: row[0]+row[1], axis=1)
origin_Fake['label'] = 0 # Fake
origin_True['label'] = 1 # True
label_data1 = pd.DataFrame(columns=['text', 'label'])
label_data1 = label_data1.append(origin_Fake[['text','label']])
label_data1 = label_data1.append(origin_True[['text','label']])

# remove null data
label_data1 = label_data1.dropna()
# print(label_data1.head())
print("# of records of the first text datasets:", len(label_data1))

# of records of the first text datasets: 44898
time: 2.75 s (started: 2022-04-02 08:04:06 +00:00)


In [9]:
# input the second labeled dataset
origin_train = pd.read_csv('/content/drive/My Drive/CS5344/GP/train.csv')
# origin_test = pd.read_csv('/content/drive/My Drive/CS5344/GP/test.csv') # No label
# origin_train = origin_train.append(origin_test)
# print(origin_train)

# keep the text features and remove null data
origin_train = origin_train[['title','text','class']]
origin_train = origin_train.dropna()
origin_train['text'] = origin_train.apply(lambda row: row['title']+" "+row['text'], axis=1)
origin_train = origin_train.replace({'Fake':0, 'Real':1})
origin_train = origin_train.rename(columns={'class':'label'})
label_data2 = origin_train[['text','label']]
# print(label_data2.head())
print("# of records of the second text datasets:", len(label_data2))

# of records of the second text datasets: 40000
time: 2.33 s (started: 2022-04-02 08:04:08 +00:00)


In [10]:
# combine the labeled datasets together
label_data = label_data1.append(label_data2)
label_data['id'] = list(range(len(label_data))) 
label_data = label_data[['id','text','label']]
print(label_data.head())
print("# of records of the labeled text dataset:", len(label_data))

   id                                               text label
0   0   Donald Trump Sends Out Embarrassing New Year’...     0
1   1   Drunk Bragging Trump Staffer Started Russian ...     0
2   2   Sheriff David Clarke Becomes An Internet Joke...     0
3   3   Trump Is So Obsessed He Even Has Obama’s Name...     0
4   4   Pope Francis Just Called Out Donald Trump Dur...     0
# of records of the labeled text dataset: 84898
time: 45.2 ms (started: 2022-04-02 08:04:11 +00:00)


In [11]:
tmp1 = label_data[label_data['label']==1]
tmp0 = label_data[label_data['label']==0]
label_data = pd.concat([tmp1,tmp0],axis=0)
print(label_data.head())
print("# of records of the labeled text dataset:", len(label_data))

      id                                               text label
0  23481   Donald Trump Sends Out Embarrassing New Year’...     1
1  23482   Drunk Bragging Trump Staffer Started Russian ...     1
2  23483   Sheriff David Clarke Becomes An Internet Joke...     1
3  23484   Trump Is So Obsessed He Even Has Obama’s Name...     1
4  23485   Pope Francis Just Called Out Donald Trump Dur...     1
# of records of the labeled text dataset: 84897
time: 35.8 ms (started: 2022-04-02 08:04:11 +00:00)


## Input the unlabeled dataset for self-training
### Output: 
- main_data (dataframe, column=['id', 'text'])

In [12]:
# input the unlabeled dataset
origin_main = pd.read_csv('/content/drive/My Drive/CS5344/GP/fake_news_dataset.csv')
# print(origin_main.head())

# exclude other language
origin_main = origin_main[origin_main['language']=='english'] 


# keep the text features and remove null data
origin_main = origin_main[['uuid','title','text']]
origin_main = origin_main.rename(columns={'uuid':'id'})
origin_main = origin_main.dropna()
origin_main['text'] = origin_main.apply(lambda row: row['title']+" "+row['text'], axis=1)
main_data = origin_main[['id','text']]
# print(main_data['id'].unique()) # the id is unique
print(main_data.head)
print("# of records of the main text dataset:", len(main_data))

<bound method NDFrame.head of                                              id  \
0      6a175f46bcd24d39b3e962ad0f29936721db70db   
1      2bdc29d12605ef9cf3f09f9875040a7113be5d5b   
2      c70e149fdd53de5e61c29281100b9de0ed268bc3   
3      7cf7c15731ac2a116dd7f629bd57ea468ed70284   
4      0206b54719c7e241ffe0ad4315b808290dbe6c0f   
...                                         ...   
12908  613e1a6e130b5a3f5df62c8fb0b73667742a43db   
12909  dedc36a34e5cb1062bf4627d314227f60cd9a708   
12910  cd8bb1ae426287f3a63c2979b3b5dfb0277b10e2   
12911  213eb9eeb5479ad2588b54b24acd53bc8ead8e8c   
12912  795f05dce10c27ff4e7b3f39ddf4e75075f5421c   

                                                    text  
0      Muslims BUSTED: They Stole Millions In Gov’t B...  
1      Re: Why Did Attorney General Loretta Lynch Ple...  
2      BREAKING: Weiner Cooperating With FBI On Hilla...  
3      PIN DROP SPEECH BY FATHER OF DAUGHTER Kidnappe...  
4      FANTASTIC! TRUMP'S 7 POINT PLAN To Reform Heal...  
...

## Input 1 labeled datasets and combine them together for evaluation self-training model
### Output: 
- eval_data (dataframe, column=['id', 'text, 'label''])

In [16]:
# input the first labeled dataset
eval_data = pd.read_csv('/content/drive/My Drive/CS5344/GP/fake_or_real_news.csv')

# keep the text features and generate the labels
eval_data['text'] = eval_data.apply(lambda row: row['title']+" "+row['text'], axis=1)
eval_data['label'] = eval_data.apply(lambda row: 0 if row['label'] == 'FAKE' else 1, axis=1)
eval_data = eval_data.rename(columns={'Unnamed: 0':'id'})
# print(len(eval_data['Unnamed: 0'].unique())) # the id is unique
eval_data = eval_data[['id','text','label']]

eval_data = eval_data.dropna()
print(eval_data)
print("# of records of the evaluation datasets:", len(eval_data))

         id                                               text  label
0      8476  You Can Smell Hillary’s Fear Daniel Greenfield...      0
1     10294  Watch The Exact Moment Paul Ryan Committed Pol...      0
2      3608  Kerry to go to Paris in gesture of sympathy U....      1
3     10142  Bernie supporters on Twitter erupt in anger ag...      0
4       875  The Battle of New York: Why This Primary Matte...      1
...     ...                                                ...    ...
6330   4490  State Department says it can't find emails fro...      1
6331   8062  The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...      0
6332   8622  Anti-Trump Protesters Are Tools of the Oligarc...      0
6333   4021  In Ethiopia, Obama seeks progress on peace, se...      1
6334   4330  Jeb Bush Is Suddenly Attacking Trump. Here's W...      1

[6335 rows x 3 columns]
# of records of the evaluation datasets: 6335
time: 707 ms (started: 2022-04-02 08:07:27 +00:00)


## Data Preprocessing
### Steps:
- convert to lowercase
- remove numbers
- remove puctuations
- remove stopwords
- lemmatization

### Output: 
- label_data_df (spark dataframe, column=['id', 'label', 'token'])
- main_data_df (spark dataframe, column=['id', 'token'])

In [17]:
# load the data into rdd format
label_data_rdd = sc.parallelize(label_data.values.tolist())
main_data_rdd = sc.parallelize(main_data.values.tolist())
eval_data_rdd = sc.parallelize(eval_data.values.tolist())
# print(eval_data_rdd.take(5))

# transformed to lowercase
label_data_lower = label_data_rdd.map(lambda f: ((f[0], f[2]), str.lower(f[1])))
eval_data_lower = eval_data_rdd.map(lambda f: ((f[0], f[2]), str.lower(f[1])))
main_data_lower = main_data_rdd.map(lambda f: (f[0], str.lower(f[1])))
# print(eval_data_lower.take(5))

# remove numbers
label_data_no_number = label_data_lower.map(lambda f: (f[0], re.sub(r'\d+', '', f[1])))
eval_data_no_number = eval_data_lower.map(lambda f: (f[0], re.sub(r'\d+', '', f[1])))
main_data_no_number = main_data_lower.map(lambda f: (f[0], re.sub(r'\d+', '', f[1])))
# print(eval_data_no_number.take(5))

# remove punctuation and split it into words
label_data_words = label_data_no_number.map(lambda f: (f[0], re.split(r'[^\w]+',f[1])))
label_data_words = label_data_words.map(lambda f: (f[0], [i for i in f[1] if i != '']))
eval_data_words = eval_data_no_number.map(lambda f: (f[0], re.split(r'[^\w]+',f[1])))
eval_data_words = eval_data_words.map(lambda f: (f[0], [i for i in f[1] if i != '']))
main_data_words = main_data_no_number.map(lambda f: (f[0], re.split(r'[^\w]+',f[1])))
main_data_words = main_data_words.map(lambda f: (f[0], [i for i in f[1] if i != '']))
print(eval_data_words.take(5))


time: 1.76 s (started: 2022-04-02 08:07:30 +00:00)


In [18]:
# remove stopwords
def remove_stopwords(words, lang='english'):
  from nltk.corpus import stopwords
  lang_stopwords = stopwords.words(lang)
  stopwords_removed = [w for w in words if w not in lang_stopwords]
  return stopwords_removed

label_data_no_stopwords = label_data_words.map(lambda f: (f[0], remove_stopwords(f[1])))
eval_data_no_stopwords = eval_data_words.map(lambda f: (f[0], remove_stopwords(f[1])))
main_data_no_stopwords = main_data_words.map(lambda f: (f[0], remove_stopwords(f[1])))
# print(eval_data_no_stopwords.take(5))

# lemmatization
# Function to find part of speech tag for a word
def find_pos(word):
    # Part of Speech constants
    # ADJ, ADJ_SAT, ADV, NOUN, VERB = 'a', 's', 'r', 'n', 'v'

    pos = nltk.pos_tag(nltk.word_tokenize(word))[0][1]
    
    # Adjective tags - 'JJ', 'JJR', 'JJS'
    if pos.lower()[0] == 'j':
        return 'a'
    # Adverb tags - 'RB', 'RBR', 'RBS'
    elif pos.lower()[0] == 'r':
        return 'r'
    # Verb tags - 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'
    elif pos.lower()[0] == 'v':
        return 'v'

    # Noun tags - 'NN', 'NNS', 'NNP', 'NNPS'
    else:
        return 'n'


# Function to apply lemmatization to a list of words
def words_lemmatizer(words, encoding="utf8"):
    lemma_words = []
    wl = WordNetLemmatizer()
    for word in words:
        pos = find_pos(word)
        lemma_words.append(wl.lemmatize(word, pos))
    return lemma_words
    

label_data_lemmatization = label_data_no_stopwords.map(lambda f: (f[0][0], f[0][1], words_lemmatizer(f[1])))
eval_data_lemmatization = eval_data_no_stopwords.map(lambda f: (f[0][0], f[0][1], words_lemmatizer(f[1])))
main_data_lemmatization = main_data_no_stopwords.map(lambda f: (f[0], words_lemmatizer(f[1])))
# print(label_data_lemmatization.take(5))
# print(eval_data_lemmatization.take(5))

time: 36.7 ms (started: 2022-04-02 08:07:47 +00:00)


In [19]:
# generate spark df of labeled dataset
label_data_df = spark.createDataFrame(label_data_lemmatization, ['id', 'label', 'token'])
label_data_df.show()

# generate spark df of evaluation dataset
eval_data_df = spark.createDataFrame(eval_data_lemmatization, ['id', 'label', 'token'])
eval_data_df.show()

# generate spark df of unlabeled dataset
main_data_df = spark.createDataFrame(main_data_lemmatization, ['id', 'token'])
main_data_df.show()

+-----+-----+--------------------+
|   id|label|               token|
+-----+-----+--------------------+
|23481|    1|[donald, trump, s...|
|23482|    1|[drunk, bragging,...|
|23483|    1|[sheriff, david, ...|
|23484|    1|[trump, obsess, e...|
|23485|    1|[pope, francis, c...|
|23486|    1|[racist, alabama,...|
|23487|    1|[fresh, golf, cou...|
|23488|    1|[trump, say, insa...|
|23489|    1|[former, cia, dir...|
|23490|    1|[watch, brand, ne...|
|23491|    1|[papa, john, foun...|
|23492|    1|[watch, paul, rya...|
|23493|    1|[bad, news, trump...|
|23494|    1|[watch, lindsey, ...|
|23495|    1|[heiress, disney,...|
|23496|    1|[tone, deaf, trum...|
|23497|    1|[internet, brutal...|
|23498|    1|[mueller, spokesm...|
|23499|    1|[snl, hilariously...|
|23500|    1|[republican, sena...|
+-----+-----+--------------------+
only showing top 20 rows

+-----+-----+--------------------+
|   id|label|               token|
+-----+-----+--------------------+
| 8476|    0|[smell, hillary,

In [20]:
label_data_lemmatization_string = label_data_lemmatization.map(lambda f: (f[0], f[1], ",".join([i for i in f[2]])))
eval_data_lemmatization_string = eval_data_lemmatization.map(lambda f: (f[0], f[1], ",".join([i for i in f[2]])))
main_data_lemmatization_string = main_data_lemmatization.map(lambda f: (f[0], ",".join([i for i in f[1]])))

time: 4.64 ms (started: 2022-04-02 08:08:27 +00:00)


In [21]:
# generate spark df of labeled dataset
label_data_string_df = spark.createDataFrame(label_data_lemmatization_string, ['id', 'label', 'string'])
label_data_string_df.show()

# generate spark df of evaluation dataset
eval_data_string_df = spark.createDataFrame(eval_data_lemmatization_string, ['id', 'label', 'string'])
eval_data_string_df.show()

# generate spark df of unlabeled dataset
main_data_string_df = spark.createDataFrame(main_data_lemmatization_string, ['id', 'string'])
main_data_string_df.show()

+-----+-----+--------------------+
|   id|label|              string|
+-----+-----+--------------------+
|23481|    1|donald,trump,send...|
|23482|    1|drunk,bragging,tr...|
|23483|    1|sheriff,david,cla...|
|23484|    1|trump,obsess,even...|
|23485|    1|pope,francis,call...|
|23486|    1|racist,alabama,co...|
|23487|    1|fresh,golf,course...|
|23488|    1|trump,say,insanel...|
|23489|    1|former,cia,direct...|
|23490|    1|watch,brand,new,p...|
|23491|    1|papa,john,founder...|
|23492|    1|watch,paul,ryan,t...|
|23493|    1|bad,news,trump,mi...|
|23494|    1|watch,lindsey,gra...|
|23495|    1|heiress,disney,em...|
|23496|    1|tone,deaf,trump,c...|
|23497|    1|internet,brutally...|
|23498|    1|mueller,spokesman...|
|23499|    1|snl,hilariously,m...|
|23500|    1|republican,senato...|
+-----+-----+--------------------+
only showing top 20 rows

+-----+-----+--------------------+
|   id|label|              string|
+-----+-----+--------------------+
| 8476|    0|smell,hillary,fe

In [22]:
# label_data_string_df.write.csv('/content/drive/My Drive/CS5344/GP/label_data_string_df')
eval_data_string_df.write.csv('/content/drive/My Drive/CS5344/GP/eval_data_string_df')
# main_data_string_df.write.csv('/content/drive/My Drive/CS5344/GP/main_data_string_df')

time: 5min 47s (started: 2022-04-02 08:09:03 +00:00)


## CountVector
- train the count vector model (no need to run again)
- save the model (no need to run again)
- read the pre-trained model and generate the count vectors for the datasets

### Output: 
- label_data_cv (spark dataframe, column=['id', 'label', 'token', 'count_vector'])
- main_data_cv (spark dataframe, column=['id', 'token', 'count_vector'])

In [23]:
# # generate the count vector for the labeled dataset
# cv = CountVectorizer(inputCol='token', outputCol='count_vector')
# m_cv = cv.fit(label_data_df)
# label_data_cv = m_cv.transform(label_data_df)
# label_data_cv.show()

time: 1.71 ms (started: 2022-04-02 08:14:50 +00:00)


In [24]:
# # generate the count vector for the unlabeled dataset
# main_data_cv = m_cv.transform(main_data_df)
# main_data_cv.show(8, False)

time: 1.23 ms (started: 2022-04-02 08:14:50 +00:00)


In [25]:
# save the trained countvectorizer model
modelPath = '/content/drive/My Drive/CS5344/GP/count_vectorizer_model'
# m_cv.save(modelPath)

# generate the count vectors directly using the pre-trained model
loadedModel = CountVectorizerModel.load(modelPath)
label_data_cv = loadedModel.transform(label_data_df)
main_data_cv = loadedModel.transform(main_data_df)

label_data_cv.show()
main_data_cv.show()

+-----+-----+--------------------+--------------------+
|   id|label|               token|        count_vector|
+-----+-----+--------------------+--------------------+
|23481|    1|[donald, trump, s...|(100949,[0,1,3,5,...|
|23482|    1|[drunk, bragging,...|(100949,[0,1,2,3,...|
|23483|    1|[sheriff, david, ...|(100949,[0,6,7,8,...|
|23484|    1|[trump, obsess, e...|(100949,[0,1,2,3,...|
|23485|    1|[pope, francis, c...|(100949,[0,1,2,4,...|
|23486|    1|[racist, alabama,...|(100949,[1,5,6,14...|
|23487|    1|[fresh, golf, cou...|(100949,[0,1,3,7,...|
|23488|    1|[trump, say, insa...|(100949,[0,1,3,4,...|
|23489|    1|[former, cia, dir...|(100949,[0,2,3,4,...|
|23490|    1|[watch, brand, ne...|(100949,[0,1,2,3,...|
|23491|    1|[papa, john, foun...|(100949,[0,2,5,6,...|
|23492|    1|[watch, paul, rya...|(100949,[0,1,2,4,...|
|23493|    1|[bad, news, trump...|(100949,[0,1,2,3,...|
|23494|    1|[watch, lindsey, ...|(100949,[0,1,3,9,...|
|23495|    1|[heiress, disney,...|(100949,[0,1,2

In [26]:
cv_vocabulary = loadedModel.vocabulary

time: 848 ms (started: 2022-04-02 08:15:11 +00:00)


In [27]:
cv_vocabulary_df = pd.DataFrame(cv_vocabulary, columns=['vocabulary'])
cv_vocabulary_df[cv_vocabulary_df['vocabulary'].isnull()]

Unnamed: 0,vocabulary


time: 42.5 ms (started: 2022-04-02 08:15:11 +00:00)


In [28]:
cv_vocabulary_df.to_csv('/content/drive/My Drive/CS5344/GP/vocabulary.csv', index = False)

time: 654 ms (started: 2022-04-02 08:15:11 +00:00)


## TF-IDF Vector
### Steps:
- use HashingTF to calculate the TF
- train the TF-IDF vector model (no need to run again)
- save the model (no need to run again)
- read the pre-trained model and generate the TF-IDF vectors for the datasets

### Output: 
- label_data_tf_idf (spark dataframe, column=['id', 'label', 'token', 'tf_vector', 'tf_idf_vector'])
- main_data_tf_idf (spark dataframe, column=['id', 'token', 'tf_vector', 'tf_idf_vector'])

In [29]:
# generate the TF-IDF vector of the labeled dataset 
tf = HashingTF(inputCol='token', outputCol='tf_vector')
label_data_tf = tf.transform(label_data_df)
label_data_tf.show()

# tf_idf = IDF(inputCol='tf_vector', outputCol='tf_idf_vector')
# m_tf_idf = tf_idf.fit(label_data_tf)
# label_data_tf_idf = m_tf_idf.transform(label_data_tf)
# label_data_tf_idf.show()

+-----+-----+--------------------+--------------------+
|   id|label|               token|           tf_vector|
+-----+-----+--------------------+--------------------+
|23481|    1|[donald, trump, s...|(262144,[531,1512...|
|23482|    1|[drunk, bragging,...|(262144,[3148,471...|
|23483|    1|[sheriff, david, ...|(262144,[531,794,...|
|23484|    1|[trump, obsess, e...|(262144,[531,654,...|
|23485|    1|[pope, francis, c...|(262144,[2437,392...|
|23486|    1|[racist, alabama,...|(262144,[1226,425...|
|23487|    1|[fresh, golf, cou...|(262144,[1004,151...|
|23488|    1|[trump, say, insa...|(262144,[2306,594...|
|23489|    1|[former, cia, dir...|(262144,[619,1004...|
|23490|    1|[watch, brand, ne...|(262144,[4214,853...|
|23491|    1|[papa, john, foun...|(262144,[2325,236...|
|23492|    1|[watch, paul, rya...|(262144,[531,1512...|
|23493|    1|[bad, news, trump...|(262144,[1451,160...|
|23494|    1|[watch, lindsey, ...|(262144,[531,1512...|
|23495|    1|[heiress, disney,...|(262144,[531,1

In [30]:
# generate the TF-IDF vector of the unlabeled dataset
main_data_tf = tf.transform(main_data_df)
# main_data_tf_idf = m_tf_idf.transform(main_data_tf)
# main_data_tf_idf.show()

time: 52.6 ms (started: 2022-04-02 08:15:19 +00:00)


In [31]:
# save the trained IDF Model 
modelPath = '/content/drive/My Drive/CS5344/GP/TF_IDF_model'
# m_tf_idf.save(modelPath)

# generate the TF-IDF vectors directly using the pre-trained model
loadedModel = IDFModel.load(modelPath)
label_data_tf_idf = loadedModel.transform(label_data_tf)
main_data_tf_idf = loadedModel.transform(main_data_tf)

label_data_tf_idf.show()
main_data_tf_idf.show()

+-----+-----+--------------------+--------------------+--------------------+
|   id|label|               token|           tf_vector|       tf_idf_vector|
+-----+-----+--------------------+--------------------+--------------------+
|23481|    1|[donald, trump, s...|(262144,[531,1512...|(262144,[531,1512...|
|23482|    1|[drunk, bragging,...|(262144,[3148,471...|(262144,[3148,471...|
|23483|    1|[sheriff, david, ...|(262144,[531,794,...|(262144,[531,794,...|
|23484|    1|[trump, obsess, e...|(262144,[531,654,...|(262144,[531,654,...|
|23485|    1|[pope, francis, c...|(262144,[2437,392...|(262144,[2437,392...|
|23486|    1|[racist, alabama,...|(262144,[1226,425...|(262144,[1226,425...|
|23487|    1|[fresh, golf, cou...|(262144,[1004,151...|(262144,[1004,151...|
|23488|    1|[trump, say, insa...|(262144,[2306,594...|(262144,[2306,594...|
|23489|    1|[former, cia, dir...|(262144,[619,1004...|(262144,[619,1004...|
|23490|    1|[watch, brand, ne...|(262144,[4214,853...|(262144,[4214,853...|

## Standard the Format of the Outputs
### Outputs: 
- BoW 1-gram of the labeled dataset: label_cv_output (spark dataframe, column=['id', 'features', 'label']) 
- BoW 1-gram of the unlabeled dataset: main_cv_output (spark dataframe, column=['id', 'features'])
- TF-IDF of the labeled dataset: label_tfidf_output (spark dataframe, column=['id', 'features', 'label'])
- TF-IDF of the unlabeled dataset: main_tfidf_output (spark dataframe, column=['id', 'features'])

In [32]:
# Output
label_cv_output = label_data_cv.select('id', 'count_vector', 'label')
main_cv_output = main_data_cv.select('id', 'count_vector')
label_tfidf_output = label_data_tf_idf.select('id', 'tf_idf_vector', 'label')
main_tfidf_output = main_data_tf_idf.select('id', 'tf_idf_vector')
label_cv_output = label_cv_output.withColumnRenamed('count_vector', 'features')
main_cv_output = main_cv_output.withColumnRenamed('count_vector', 'features')
label_tfidf_output = label_tfidf_output.withColumnRenamed('tf_idf_vector', 'features')
main_tfidf_output = main_tfidf_output.withColumnRenamed('tf_idf_vector', 'features')

label_cv_output.show()

+-----+--------------------+-----+
|   id|            features|label|
+-----+--------------------+-----+
|23481|(100949,[0,1,3,5,...|    1|
|23482|(100949,[0,1,2,3,...|    1|
|23483|(100949,[0,6,7,8,...|    1|
|23484|(100949,[0,1,2,3,...|    1|
|23485|(100949,[0,1,2,4,...|    1|
|23486|(100949,[1,5,6,14...|    1|
|23487|(100949,[0,1,3,7,...|    1|
|23488|(100949,[0,1,3,4,...|    1|
|23489|(100949,[0,2,3,4,...|    1|
|23490|(100949,[0,1,2,3,...|    1|
|23491|(100949,[0,2,5,6,...|    1|
|23492|(100949,[0,1,2,4,...|    1|
|23493|(100949,[0,1,2,3,...|    1|
|23494|(100949,[0,1,3,9,...|    1|
|23495|(100949,[0,1,2,6,...|    1|
|23496|(100949,[0,1,5,7,...|    1|
|23497|(100949,[0,3,4,6,...|    1|
|23498|(100949,[0,1,3,7,...|    1|
|23499|(100949,[0,1,3,4,...|    1|
|23500|(100949,[0,1,3,5,...|    1|
+-----+--------------------+-----+
only showing top 20 rows

time: 31min 41s (started: 2022-04-02 08:15:46 +00:00)


In [33]:
# from sklearn.feature_extraction.text import CountVectorizer

time: 2.83 ms (started: 2022-04-02 08:47:28 +00:00)


In [34]:
# cv = CountVectorizer(vocabulary = cv_vocabulary)
# label_features_cv = cv.fit_transform(label_data_pd_df['token'])
# main_features_cv = cv.transform(main_data_pd_df['token'])

NameError: ignored

time: 35.2 ms (started: 2022-04-02 08:47:28 +00:00)


In [None]:
# from sklearn.feature_extraction.text import TfidfTransformer

In [None]:
# tf_idf = TfidfTransformer(vocabulary = cv_vocabulary)
# label_features_tfidf = tf_idf.fit_transform(label_data_pd_df['token'])
# main_features_tfidf = tf_idf.transform(main_data_pd_df['token'])