In [1]:
import os 
import flask
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql import functions
from pyspark.sql.functions import col, udf
from engine import TextClassificationEngine, TopicModellingModel
from utilities import (
    send_request_reddit_get_new_post, 
    get_subtopic_top_word,
    get_most_popular_topic,
    get_background_image
)

[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Build (or reuse, via getOrCreate) the Spark session that is handed to every
# engine constructed in the cells below.
spark = (
    SparkSession.builder
    .appName('Web Appp')
    # Extra jar put on the Spark classpath; the path has no directory component.
    .config("spark.jars", "mysql-connector-j-8.0.32.jar")
    # Memory settings for the driver and the executors.
    .config("spark.driver.memory", "6g")
    .config("spark.executor.memory", "8g")
    .getOrCreate()
)

In [3]:
# Working directory at the time this cell runs; later joined with 'template'
# to locate the Flask template folder.
path = os.getcwd()

In [4]:
# Text classifier used by the web routes via `predict_label`. Per the recorded
# log output, constructing it loads labelled data, preprocesses and vectorizes
# it, and trains the model, so this cell does real work on every run.
label_engine = TextClassificationEngine(spark)

INFO:engine:Starting up text classification engine: 
INFO:engine:Loading labled data...
INFO:engine:Loading completed
INFO:engine:Preprocessing data...
INFO:engine:Preprocessing completed
INFO:engine:Vectorize data...
INFO:engine:Vectorization completed
INFO:engine:Training text classification model...
INFO:engine:Text classification model built!


In [5]:
# Topic model for posts labelled 'Business'.
# NOTE(review): the third argument (3) is presumably the number of LDA topics
# — confirm against engine.TopicModellingModel.
business_engine = TopicModellingModel(spark, 'Business', 3)

INFO:engine:Starting up model LDA Business: 
INFO:engine:Loading data...
INFO:engine:Loading completed
INFO:engine:Preprocessing data...
INFO:engine:Preprocessing completed
INFO:engine:LDA Business model built!


In [6]:
# Topic model for posts labelled 'Sci/Tech'.
# NOTE(review): the third argument (4) is presumably the number of LDA topics
# — confirm against engine.TopicModellingModel.
# NOTE(review): the recorded log for this cell still says "LDA Business";
# looks like a hard-coded log message inside the engine — confirm.
tech_engine = TopicModellingModel(spark, 'Sci/Tech', 4)

INFO:engine:Starting up model LDA Business: 
INFO:engine:Loading data...
INFO:engine:Loading completed
INFO:engine:Preprocessing data...
INFO:engine:Preprocessing completed
INFO:engine:LDA Business model built!


In [7]:
# Topic model for posts labelled 'Sports'.
# NOTE(review): the third argument (4) is presumably the number of LDA topics
# — confirm against engine.TopicModellingModel.
# NOTE(review): the recorded log for this cell still says "LDA Business";
# looks like a hard-coded log message inside the engine — confirm.
sports_engine = TopicModellingModel(spark, 'Sports', 4)

INFO:engine:Starting up model LDA Business: 
INFO:engine:Loading data...
INFO:engine:Loading completed
INFO:engine:Preprocessing data...
INFO:engine:Preprocessing completed
INFO:engine:LDA Business model built!


In [8]:
# Topic model for posts labelled 'World'; also the fallback engine returned by
# choose_engine for any label it does not recognise.
# NOTE(review): the third argument (3) is presumably the number of LDA topics
# — confirm against engine.TopicModellingModel.
# NOTE(review): the recorded log for this cell still says "LDA Business";
# looks like a hard-coded log message inside the engine — confirm.
world_engine = TopicModellingModel(spark, 'World', 3)

INFO:engine:Starting up model LDA Business: 
INFO:engine:Loading data...
INFO:engine:Loading completed
INFO:engine:Preprocessing data...
INFO:engine:Preprocessing completed
INFO:engine:LDA Business model built!


In [9]:
def choose_engine(label_name):
    """Return the topic-modelling engine that matches a predicted label.

    'Business', 'Sci/Tech' and 'Sports' map to their dedicated engines; any
    other value (including 'World') yields the world engine.
    """
    if label_name == 'Business':
        return business_engine
    if label_name == 'Sci/Tech':
        return tech_engine
    if label_name == 'Sports':
        return sports_engine
    # Everything else falls through to the world engine.
    return world_engine

In [42]:
# Flask application; templates are loaded from '<working dir>/template'.
app = flask.Flask(__name__, template_folder = os.path.join(path, 'template'))


@app.route('/', methods = ['GET', 'POST'])
def home():
    """Landing page and recommendation endpoint.

    Non-POST requests render 'home.html'.

    POST expects a form field 'user_id' (a Reddit username). The handler:
      1. fetches the user's submitted posts through
         send_request_reddit_get_new_post,
      2. predicts a label for every post title and keeps the most frequent
         label,
      3. runs the matching topic engine on the posts carrying that label and
         keeps the most frequent topic,
      4. renders 'user_result.html' with the recommendations for that topic.

    Responds with 404 when the user has no submitted posts or when the topic
    engine returns no recommendations.
    """
    from urllib.parse import quote

    # Flask registers HEAD alongside GET; treating every non-POST request as a
    # page view keeps such requests from falling through and returning None.
    if flask.request.method != 'POST':
        return flask.render_template('home.html')

    username = flask.request.form['user_id']
    # The username is user-controlled and ends up in a URL path, so escape it
    # fully (including '/') to keep it from altering the requested endpoint.
    url = f"https://oauth.reddit.com/user/{quote(username, safe='')}/submitted"
    # NOTE(review): the second return value is not inspected here; its meaning
    # is defined in utilities — consider checking it before using the response.
    response, _status = send_request_reddit_get_new_post(url)

    # Build all rows first and create the DataFrame once instead of
    # concatenating inside the loop.
    post_records = [
        {'post_id': post['data']['id'], 'descriptions': post['data']['title']}
        for post in response['data']['children']
    ]
    if not post_records:
        flask.abort(404, description = 'No submitted posts found for this user.')
    # The previous implementation prepended each new post, so the resulting
    # frame was in reverse listing order; keep that order.
    total_post = pd.DataFrame(post_records[::-1], columns = ['post_id', 'descriptions'])

    # Most frequent predicted label across the user's posts.
    predicted_label = label_engine.predict_label(total_post)
    grouped_label = predicted_label.groupBy("label_name").count()
    label_name = get_most_popular_topic(grouped_label, 'label_name')
    considered_post = predicted_label.filter(col('label_name') == label_name)

    # Most frequent topic among the posts that carry that label.
    topic_engine = choose_engine(label_name)
    predicted_topic = topic_engine.predict_topic(considered_post)
    grouped_topic = predicted_topic.groupBy("topic").count()
    topic = get_most_popular_topic(grouped_topic, 'topic')

    results = topic_engine.get_recommendation(topic).collect()
    if not results:
        # Without any row there is nothing to read the category/subtopic from.
        flask.abort(404, description = 'No recommendations found for this user.')
    subtopics = get_subtopic_top_word(topic_engine)

    # Row positions used: 3 = description, 4 = creation time, 5 = source URL,
    # 6 = post URL. Each recommendation is a list in the order
    # [description, created_utc, source_url, post_url, background_image].
    # NOTE(review): the recorded server log shows a "GET /None" 404 right
    # after a POST to this route, which suggests get_background_image can
    # return None and the template renders it as a URL — confirm and handle.
    recommendations = [
        [row[3], str(row[4]), row[5], row[6], get_background_image(row[5])]
        for row in results
    ]

    # Category (position 7) and subtopic key (position 0) are read from the
    # last row, as before, but no longer via a leaked loop variable.
    last_row = results[-1]
    return flask.render_template(
        'user_result.html',
        username = username,
        category = last_row[7],
        subtopic = ', '.join(subtopics[last_row[0]]),
        recommendations = recommendations,
    )
    
@app.route('/test', methods=['GET', 'POST'])
def test():
    """Playground page for the text classifier.

    Non-POST requests render 'test_model.html' without a result.
    POST expects a form field 'inputValue', classifies that single sentence
    with label_engine, and re-renders the page with the predicted label passed
    as `result`.
    """
    # Flask registers HEAD alongside GET; treating every non-POST request as a
    # page view keeps such requests from falling through and returning None.
    if flask.request.method != 'POST':
        return flask.render_template('test_model.html')

    sentence = flask.request.form['inputValue']
    # A single (id, text) row; the id '1' is just a placeholder.
    input_rows = [('1', sentence)]
    # Only one row went in, so the first cell of the first collected row is
    # the predicted label.
    predicted_label = label_engine.predict_label(input_rows).select('label_name').collect()[0][0]
    return flask.render_template('test_model.html', result = predicted_label)

# Start Flask's built-in server when this code runs as the main module (which
# is the case for a notebook cell). host "0.0.0.0" listens on all interfaces;
# the recorded output shows it serving on port 5000 with debug mode off.
# The call keeps serving until interrupted, so later cells will not run
# while it is active.
if __name__ == '__main__':
    app.run(host = "0.0.0.0")

 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:5000
 * Running on http://172.21.0.3:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug:172.21.0.1 - - [09/Jun/2023 02:10:41] "GET / HTTP/1.1" 200 -
  for column, series in pdf.iteritems():
INFO:werkzeug:172.21.0.1 - - [09/Jun/2023 02:11:03] "POST / HTTP/1.1" 200 -
INFO:py4j.clientserver:Closing down clientserver connection
INFO:werkzeug:172.21.0.1 - - [09/Jun/2023 02:11:03] "[33mGET /None HTTP/1.1[0m" 404 -
INFO:werkzeug:172.21.0.1 - - [09/Jun/2023 02:11:28] "GET / HTTP/1.1" 200 -
INFO:werkzeug:172.21.0.1 - - [09/Jun/2023 02:11:30] "GET /test HTTP/1.1" 200 -
INFO:werkzeug:172.21.0.1 - - [09/Jun/2023 02:11:34] "POST /test HTTP/1.1" 200 -
INFO:py4j.clientserver:Closing down clientserver connection
INFO:werkzeug:172.21.0.1 - - [09/Jun/2023 02:11:37] "GET / HTTP/1.1" 200 -
