In [0]:
from pyspark.sql.session import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql import Window
import sys
import os

In [0]:
# Obtain the notebook's SparkSession: reuse the active one if it exists, otherwise create it.
spark = SparkSession.builder.getOrCreate()

In [0]:
# Load the subreddit_rank_2 parquet dataset and bring its subreddit ids to the
# driver as a plain Python list, sorted ascending.
rank = spark.read.parquet('dbfs:/mnt/lsde/group24/subreddit_rank_2')
sub_id_list = [
    row.subreddit_id
    for row in rank.select('subreddit_id').orderBy('subreddit_id').collect()
]

In [0]:
# Names of the entries in the submissions directory, listed through the /dbfs path.
all_sub_files_list = os.listdir('/dbfs/mnt/lsde/group24/submissions')
# The same directory as a dbfs: URI; select_title() appends each entry name to it.
file_path = 'dbfs:/mnt/lsde/group24/submissions/'
def select_title():
    """Gather subreddit_id, title and score for the subreddits in ``sub_id_list``.

    Every entry of ``all_sub_files_list`` is read as parquet from
    ``file_path + entry``, reduced to the three columns of interest, filtered
    to rows whose ``subreddit_id`` occurs in ``sub_id_list`` and unioned onto
    an initially empty DataFrame with schema
    ``subreddit_id string, title string, score long``.

    Returns:
        The unioned DataFrame ordered by ``subreddit_id`` ascending and then
        ``score`` descending.
    """
    wanted_columns = ('subreddit_id', 'title', 'score')
    # Start from an empty, explicitly typed frame so the first union has a target.
    combined = spark.createDataFrame(
        spark.sparkContext.emptyRDD(),
        'subreddit_id string, title string, score long',
    )
    for entry_name in all_sub_files_list:
        submissions = spark.read.parquet(file_path + entry_name)
        matching = submissions.select(*wanted_columns) \
                              .filter(col('subreddit_id').isin(sub_id_list))
        # DataFrame.union matches columns by position, hence the fixed select order above.
        combined = combined.union(matching)
    return combined.orderBy(col('subreddit_id').asc(), col('score').desc())
# DataFrame of (subreddit_id, title, score) rows for the subreddits in sub_id_list.
titles_with_subrid = select_title()

In [0]:
# Number the titles inside each subreddit from highest to lowest score
# (score_rank == 1 is the best-scoring title). row_number() breaks score ties
# arbitrarily.
window = Window.partitionBy('subreddit_id').orderBy(col('score').desc())
titles_with_subrid = titles_with_subrid.withColumn(
    'score_rank',
    row_number().over(window),
)

In [0]:
# Keep the titles whose score_rank is within the top 1000 of their subreddit and
# collect them into one list per subreddit.
# The rank filter is applied BEFORE the projection so it never refers to a
# column that the select has already dropped.
top_subreddit = (
    titles_with_subrid
    .filter(col('score_rank') <= 1000)
    .select(col('subreddit_id'), col('title'))
    .groupBy('subreddit_id')
    .agg(collect_list('title').alias('titles'))
)
write_path = 'dbfs:/mnt/lsde/group24/'
# write_path already ends with '/', so the dataset name is appended without a
# leading slash; this yields exactly the path that is read back afterwards.
top_subreddit.write.format("parquet").mode("overwrite").save(write_path + "top_subreddit")

In [0]:
# Read the top_subreddit parquet dataset (columns: subreddit_id, titles) back from storage.
titles_id = spark.read.parquet('dbfs:/mnt/lsde/group24/top_subreddit')

In [0]:
# Bring every (subreddit_id, titles) row to the driver as a list of Row objects.
titles_list = titles_id.collect()

In [0]:
import emoji
import re
import RAKE
import operator
import jieba
import jieba.analyse
import collections

In [0]:
def rm_emoji(sentence):
    """Strip emoji from ``sentence``.

    ``emoji.demojize`` first rewrites each emoji as a textual ``:name:`` code;
    every ``:...:`` run of non-whitespace characters is then replaced by a
    single space.

    NOTE(review): the second step also removes colon-delimited text that was
    already present in the input (for example the ``:30:`` in ``10:30:45``).

    Args:
        sentence: text to clean.

    Returns:
        The text with emoji codes replaced by spaces.
    """
    sentence = emoji.demojize(sentence)
    # Raw string: '\S' is not a valid Python string escape sequence.
    return re.sub(r':\S+?:', ' ', sentence)
def rm_url(sentence):
    """Remove URL-like text from ``sentence``.

    The sentence is split on single spaces and each token is scrubbed on its
    own: first anything matching the ``scheme://...`` pattern is removed, then
    everything in the token up to and including a ``.com`` / ``.cn`` suffix.
    Tokens are re-joined with single spaces, so a token that is removed
    completely leaves two adjacent spaces behind.

    Args:
        sentence: text to clean.

    Returns:
        The cleaned text.
    """
    # Optional http/https scheme, '://', then a run of URL characters.
    url_pattern = re.compile(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%|\-)*\b', re.S)
    # Scheme-less domains: everything up to and including '.com' or '.cn'.
    domain_pattern = re.compile(r'(\b)*(.*?)\.(com|cn)')
    # str.split(' ') always yields at least one token (even for ''), so there
    # is no empty-list case to handle separately.
    cleaned_tokens = []
    for token in sentence.split(' '):
        token = url_pattern.sub('', token)
        token = domain_pattern.sub('', token)
        cleaned_tokens.append(token)
    return ' '.join(cleaned_tokens)
def rm_html_tag(sentence):
    """Delete spans that open with ``<a`` or ``<b`` and close with ``</a>`` or ``</b>``.

    The match is non-greedy and may span line breaks (``re.S``); the opening
    and closing markers, and everything between them, are removed.
    """
    return re.sub('(<a|<b)(.*?)(</a>|</b>)', '', sentence, flags=re.S)
def rm_tag(sentence):
    """Remove bracketed / hash-delimited tags and trim the result.

    A tag starts at ``[``, ``#`` or ``【`` and runs (non-greedily, across line
    breaks) to the next ``#``, ``]`` or ``】``; opener and closer do not have
    to be of the same kind.

    Args:
        sentence: text to clean.

    Returns:
        The text without tags, with leading/trailing whitespace stripped.
    """
    # Raw string: '\[', '\#' and '\】' are not valid Python string escapes.
    tag_pattern = re.compile(r'(\[|\#|【)(.*?)(\#|\]|\】)', re.S)
    return tag_pattern.sub('', sentence).strip()
def rm_at(sentence):
    """Remove ``@``-prefixed text and trim the result.

    Each ``@`` together with the non-whitespace characters that follow it is
    deleted, wherever it occurs in the text.

    Args:
        sentence: text to clean.

    Returns:
        The text without ``@...`` runs, with leading/trailing whitespace stripped.
    """
    # Raw string: '\S' is not a valid Python string escape sequence.
    at_pattern = re.compile(r'@\S*', re.S)
    return at_pattern.sub('', sentence).strip()
def rm_other(sentence):
    """Drop line breaks, ``#`` and ``*`` characters, then trim the result.

    ``str.replace`` returns a new string, so each result has to be assigned
    back; otherwise the replacements are silently lost.

    Args:
        sentence: text to clean.

    Returns:
        The text without the unwanted characters, with leading/trailing
        whitespace stripped.
    """
    for unwanted in ('\n', '\r', '#', '*'):
        sentence = sentence.replace(unwanted, '')
    return sentence.strip()
def clean_sentence(s):
    """Run one title through every cleaning step and return the result.

    The steps are applied in a fixed order: emoji, URLs, HTML tags,
    bracket/hash tags, @-mentions, then the remaining stray characters.
    """
    cleaning_steps = (rm_emoji, rm_url, rm_html_tag, rm_tag, rm_at, rm_other)
    for step in cleaning_steps:
        s = step(s)
    return s
def clean_text(text):
    """Clean every sentence in ``text`` and concatenate them into one string.

    Each cleaned sentence is prefixed with a single space, so a non-empty
    result starts with a space; an empty ``text`` gives ``''``.
    """
    return ''.join(' ' + clean_sentence(sentence) for sentence in text)

In [0]:
# Stop-word file handed to RAKE.
stop_dir = '/dbfs/FileStore/Group24/SmartStoplist.txt'
# Stop-word file handed to jieba (presumably Chinese, going by the file name).
stop_dir_cn = '/dbfs/FileStore/Group24/cn_stopwords.txt'
# Keyword extractor used by run_rake().
rake_object = RAKE.Rake(stop_dir)
# Register the second stop-word file with jieba's keyword-analysis module (used by run_rake_cn()).
jieba.analyse.set_stop_words(stop_dir_cn)

In [0]:
def sort_tuple(tup):
    """Sort a list of sequences in place by their second element, ascending.

    The list passed in is mutated and the same list object is returned.
    """
    tup.sort(key=operator.itemgetter(1))
    return tup

In [0]:
def run_rake(text):
    """Extract keywords from ``text`` with the shared ``rake_object``.

    The extractor's results are sorted ascending by their second element and
    the last (i.e. highest) 15 entries are returned; fewer if the extractor
    produced fewer.
    """
    scored_phrases = rake_object.run(text)
    ranked = sort_tuple(scored_phrases)
    return ranked[-15:]

In [0]:
def run_rake_cn(text):
    """Extract keywords from ``text`` with jieba's TextRank implementation.

    Up to 20 weighted keywords are requested, sorted ascending by weight, and
    the last (i.e. highest) 15 entries are returned.
    """
    weighted_words = jieba.analyse.textrank(text, topK=20, withWeight=True)
    ranked = sort_tuple(weighted_words)
    return ranked[-15:]

In [0]:
# (subreddit_id, keywords) pairs; filled in by extract_topic() from titles_list
kw_id_list = []
def extract_topic():
    """Compute keywords for every row of ``titles_list`` and append them to ``kw_id_list``.

    Each row is read as ``(subreddit_id, titles)``. Three subreddit ids get a
    fixed, hard-coded keyword; ``t5_x72uq`` goes through ``run_rake_cn`` (and
    its keywords are printed); every other subreddit goes through ``run_rake``.
    A progress line is printed per subreddit. Nothing is returned, and calling
    the function again appends to the same list.
    """
    # Subreddits whose keyword is assigned by hand instead of being extracted.
    fixed_keywords = {
        't5_2qq6z': [('hmmm', 10.0)],
        't5_37k29': [('Ich_iel', 10.0)],
        't5_38e1l': [('Maybe maybe maybe', 10.0)],
    }
    for row in titles_list:
        subreddit_id = row[0]
        # The titles are cleaned for every row, including the hard-coded ones.
        cleaned_text = clean_text(row[1])
        if subreddit_id in fixed_keywords:
            # Copy so every appended entry owns its own list.
            keywords = list(fixed_keywords[subreddit_id])
        elif subreddit_id == 't5_x72uq':
            keywords = run_rake_cn(cleaned_text)
            print(keywords)
        else:
            keywords = run_rake(cleaned_text)
        print(subreddit_id + ' finished')
        kw_id_list.append((subreddit_id, keywords))

In [0]:
# Fill kw_id_list; one "<subreddit_id> finished" line is printed per subreddit.
extract_topic()

t5_2qh13 finished
t5_2qhfg finished
t5_2qhr7 finished
t5_2qi58 finished
t5_2s6v6 finished
t5_2wm0g finished
t5_37xo2 finished
t5_3ec9d finished
t5_3f1iq finished
t5_3icow finished
t5_3k1kj finished
t5_2qhu8 finished
t5_2qioo finished
t5_2qzb6 finished
t5_2r2jt finished
t5_2sxhs finished
t5_2t3ad finished
t5_2tycb finished
t5_2v0p0 finished
t5_38ipe finished
t5_3l4bg9 finished
t5_kltit finished
t5_2qhh9 finished
t5_2qjdm finished
t5_2reak finished
t5_2s837 finished
t5_2t9mw finished
t5_2v0c6 finished
t5_2xtuc finished
t5_33rc6 finished
t5_3k7ez finished
t5_42kuwz finished
t5_21turx finished
t5_2r0cn finished
t5_2r0gj finished
t5_2rawz finished
t5_2rx57 finished
t5_2tpjq finished
t5_2u05j finished
t5_2v94d finished
t5_3b749 finished
t5_2eni6u finished
t5_2qgzt finished
t5_2qtwb finished
t5_2rq8n finished
t5_2skiq finished
t5_2v23y finished
t5_2v2cd finished
t5_2w2f5 finished
t5_2yzi6 finished
t5_3adlm finished
t5_121sso finished
t5_2hy5hz finished
t5_2qlve finished
t5_2rske finished
t5_2

In [0]:
# Take the SparkContext from the existing session: the SparkContext class is not
# imported by name in the cells shown, and the session already owns the context.
sc = spark.sparkContext
# Turn the (subreddit_id, keywords) pairs into a DataFrame with inferred columns _1 and _2.
kw_id = sc.parallelize(kw_id_list).toDF()

In [0]:
# Rename the two columns to subreddit_id / kw_list. DataFrame.toDF renames by
# position, so re-running this cell is harmless (selecting '_1'/'_2' again would
# fail once the columns had been renamed).
kw_id = kw_id.toDF('subreddit_id', 'kw_list')

In [0]:
# Attach each subreddit's keyword list to its row of the rank table
# (inner join, so subreddits without keywords are dropped).
sub_kw = rank.join(kw_id, on='subreddit_id', how='inner')

In [0]:
write_path = 'dbfs:/mnt/lsde/group24/'
# write_path already ends with '/', so the dataset name is appended without a
# leading slash (avoids a doubled separator in the output path).
sub_kw.write.format("parquet").mode("overwrite").save(write_path + "subreddit_topic_2")

In [0]:
# Export the rank table as pipe-delimited CSV with a header row. coalesce(1)
# reduces the data to a single partition before writing, and the committer
# option asks Hadoop not to mark the job as successful with a marker file.
(
    rank.coalesce(1)
        .write
        .mode("overwrite")
        .option("mapreduce.fileoutputcommitter.marksuccessfuljobs", "false")
        .option("header", "true")
        .option("delimiter", "|")
        .csv('dbfs:/FileStore/Group24/subreddit_rank.csv')
)