# Requisitos Previos

In [None]:
!pip install pyngrok
!pip install Flask-CORS
!pip install neattext
!pip install emoji
!pip install scikit-learn==1.4.2 
!pip install selenium

!pip install joblib
!pip install nltk

In [None]:
import joblib
import nltk
# Fetch the NLTK data packages needed by the text preprocessing further down.
nltk.download('stopwords')  # stop-word lists (read later through nltk.corpus.stopwords)
nltk.download('punkt')  # tokenizer data, presumably required by word_tokenize — confirm
nltk.download('averaged_perceptron_tagger')  # tagger data, presumably required by pos_tag — confirm
nltk.download('wordnet')  # WordNet data, presumably required by WordNetLemmatizer — confirm
# from googletrans import Translator

In [None]:
from flask import request, jsonify, Flask
from pyngrok import ngrok, conf
import getpass
import threading

In [None]:
def is_running_on_colab():
    """Tell whether the notebook is executing inside Google Colab.

    The check is simply whether the ``google.colab`` package can be imported.
    The result is used to choose between the Colab path and the path inside
    the GitHub project.

    Returns:
        True when ``google.colab`` imports successfully, False otherwise.
    """
    try:
        import google.colab  # noqa: F401  (imported only to probe availability)
    except ImportError:
        return False
    else:
        return True
    
from enum import Enum
# Helper enum listing the model types, so several models can later be trained in a single run.
class Modelos(Enum):
    """Identifiers of the classifier variants handled by this notebook.

    The string value of a member is concatenated into the file name of a
    previously trained model when it is loaded, so the values must match the
    names the artifacts were saved under.
    """
    LOGISTIC_REGRESSION = 'logistic_regression'
    DECISION_TREE = 'decision_tree'
    MULTINOMIAL = 'multinomial'
    BERNOULLI = 'bernoulli'
    # NOTE(review): "gausian" spelling kept as-is; presumably saved artifacts use it — confirm before renaming.
    GAUSIAN = 'gausian'

# Función procesadora de texto

In [None]:
import neattext.functions as nfx
from nltk import pos_tag, word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from string import punctuation
import emoji

# Single shared lemmatizer instance; preprocessing_function calls it for every token.
wnl = WordNetLemmatizer()

def penn2morphy(penntag):
    """Convert a Penn Treebank POS tag to the matching WordNet POS letter.

    Only the first two characters of the tag are considered, so every tag in
    a family (e.g. ``VB``, ``VBD``, ``VBG``) maps to the same WordNet letter.

    Args:
        penntag: Penn Treebank tag such as ``'NNS'`` or ``'JJ'``.

    Returns:
        ``'n'`` (noun), ``'a'`` (adjective), ``'v'`` (verb) or ``'r'``
        (adverb). Unknown tags and unusable inputs (e.g. ``None``) fall back
        to ``'n'``.
    """
    morphy_tag = {'NN': 'n', 'JJ': 'a',
                  'VB': 'v', 'RB': 'r'}
    try:
        return morphy_tag[penntag[:2]]
    except (KeyError, TypeError):
        # KeyError: tag family not in the table.
        # TypeError: input cannot be sliced or its prefix cannot be hashed.
        # Catching only these keeps the noun fallback without also swallowing
        # KeyboardInterrupt/SystemExit like a bare ``except`` would.
        return 'n'


# Base stop list: NLTK's English stop words plus every character in string.punctuation.
stopwords_en = set(stopwords.words('english')) | set(punctuation)

# Additional tokens to discard: contraction fragments, quote marks, runs of
# dots/dashes and invisible Unicode marks that survive tokenization.
my_custom_stopwords = {
    '’', "n't", "'m", "'s", "'ve", '...', 'ca', "''", '``', '\u200d',
    'im', 'na', "'ll", '..', 'u', "'re", "'d", '--', '”', '“',
    '\u200f\u200f\u200e', '....', 'ㅤ', '\u200e\u200f\u200f\u200e',
    'x200b', 'ive', '.-', '\u200e', '‘',
}

# Final stop list used by preprocessing_function.
stopwords_en = stopwords_en | my_custom_stopwords


def preprocessing_function(text):
    """Turn raw text into a list of lemmatized, filtered tokens.

    The text is cleaned with neattext's ``clean_text``, tokenized and
    POS-tagged with NLTK, and each token is lower-cased and lemmatized using
    the WordNet POS derived from its tag. A lemma is dropped when it:

    * contains a zero-width space (``\\u200b``),
    * is in ``stopwords_en``,
    * consists only of digits, or
    * consists only of emoji.

    Args:
        text: The input text to process.

    Returns:
        The list of surviving lemmas, in their original order.
    """
    tagged_tokens = pos_tag(word_tokenize(nfx.clean_text(text)))
    kept_lemmas = []

    for token, penn_tag in tagged_tokens:
        lemma = wnl.lemmatize(token.lower(), pos=penn2morphy(penn_tag))

        if '\u200b' in lemma:
            continue
        if lemma in stopwords_en:
            continue
        if lemma.isdigit():
            continue
        if emoji.purely_emoji(lemma):
            continue

        kept_lemmas.append(lemma)

    return kept_lemmas

# Carga modelo entrenado

In [None]:
import os

# EDIT THESE PARAMETERS TO CHOOSE WHICH TRAINED MODEL IS LOADED
# --------------------------------------------------------
nombre_modelo_prev_entrenado = Modelos.LOGISTIC_REGRESSION.value
# Number of rows the model was trained on; use the format '25k' for 25,000 rows, for example.
cant_prev_entrenada = '50k'

# On Colab the artifacts are expected directly under /content/. Otherwise they
# are read from the project's folder, built with os.path.join so the separator
# matches the host OS (the trailing '' keeps the final separator). On Windows
# this yields the same '.\\tentativa_suicidio\\entrenados\\' string as before.
path_base_modelo_generado = (
    '/content/' if is_running_on_colab()
    else os.path.join('.', 'tentativa_suicidio', 'entrenados', '')
)
path_modelo_generado = path_base_modelo_generado + nombre_modelo_prev_entrenado + '_' + cant_prev_entrenada
# --------------------------------------------------------

# NOTE(security): joblib.load unpickles the file, which can execute arbitrary
# code. Only load artifacts produced by this project or another trusted source.
model = joblib.load(path_modelo_generado + '_model.pkl')
vect = joblib.load(path_modelo_generado + '_vector.pkl')

# Quick sanity check of what was loaded.
print(type(vect))
print(type(model))

# Función predict

In [None]:
# translator = Translator()

def get_tentativa_suicidio(text_input, english_text=False):
    """Classify a text with the loaded model and return the result as a bool.

    The text is preprocessed with ``preprocessing_function``, the resulting
    tokens are joined with single spaces, vectorized with ``vect`` and passed
    to ``model.predict``.

    Args:
        text_input: The text to classify.
        english_text: Currently has no effect. The translation step it was
            meant to control is disabled, so the text is used as given.

    Returns:
        ``bool`` of the first (and only) prediction. Presumably ``True``
        means the text is classified as a suicide attempt — confirm against
        the label encoding used in training.
    """
    # Translation is disabled, so the input is analysed unchanged.
    tokens = preprocessing_function(text_input)
    features = vect.transform([' '.join(tokens)])
    first_prediction = model.predict(features)[0]

    return bool(first_prediction)

# API

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import re

def get_texto(url):
    """Open *url* in headless Firefox and return the post's body text.

    If the page has a button whose id ends in ``-read-more-button``, it is
    clicked to expand the post, and the content div sharing the same id
    prefix (``<prefix>-post-rtjson-content``) is read. If there is no such
    button, the first div whose id contains ``-post-rtjson-content`` is read
    instead. (The ids look like those of a Reddit post page — confirm.)

    Args:
        url: Address of the page to load.

    Returns:
        The visible text of the content element, as a string.

    Raises:
        selenium.common.exceptions.NoSuchElementException: if no matching
            content div exists on the page.
        selenium.common.exceptions.WebDriverException: on browser/driver
            failures.
    """
    read_more_suffix = r"-read-more-button$"
    content_suffix = "-post-rtjson-content"

    options = webdriver.FirefoxOptions()
    options.add_argument('--headless')
    driver = webdriver.Firefox(options=options)
    try:
        driver.get(url)

        buttons = driver.find_elements(By.XPATH, "//button[contains(@id, '-read-more-button')]")

        # Without a read-more button there is nothing to expand; fall back to
        # any content div instead of failing on an undefined button.
        div_id_pattern = content_suffix

        # The XPath matches ids that merely contain the suffix; keep only the
        # first button whose id actually ends with it.
        for button in buttons:
            button_id = button.get_attribute("id")
            if re.search(read_more_suffix, button_id):
                button.click()
                # Give the page a moment to render the expanded text.
                time.sleep(0.5)
                div_id_pattern = re.sub(read_more_suffix, content_suffix, button_id)
                break

        texto_element = driver.find_element(By.XPATH, f"//div[contains(@id, '{div_id_pattern}')]")

        # Read the text while the session is alive: a WebElement cannot be
        # used after the driver has been shut down.
        return texto_element.text
    finally:
        # quit() ends the whole session (browser and driver process), even
        # when an exception occurred above.
        driver.quit()

In [None]:
from flask_cors import CORS

# Prompt for the ngrok auth token (getpass reads it without echoing) and
# register it on pyngrok's default configuration before opening the tunnel.
conf.get_default().auth_token = getpass.getpass()

# Flask application that hosts the routes defined below.
app = Flask(__name__)

# Apply flask-cors to the app with its default settings.
CORS(app)

# Open an ngrok tunnel to local port 5000 and show its public URL.
# NOTE(review): app.run is started later without an explicit port, so this
# relies on Flask serving on 5000 by default — confirm.
public_url = ngrok.connect(5000).public_url
print(" * ngrok tunnel \"{}\" -> \"http://127.0.0.1:{}/\"".format(public_url, 5000))

@app.route("/")
def home():
    """Serve a one-line HTML description of the API at the root URL."""
    description_html = "<p>API para predecir tentativas de suicidio en textos.</p>"
    return description_html

@app.route('/test', methods=['GET'])
def test():
    """Answer GET /test with a fixed JSON message, to check the API is reachable."""
    payload = {"message": "FUnciona"}
    return jsonify(payload)


@app.route('/output', methods=['GET'])
def predJson():
    """Classify the text passed in the ``text`` query parameter.

    Returns:
        ``{"prediction": <bool>}`` with status 200 on success, or
        ``{"error": ...}`` with status 400 when ``text`` is missing or empty.
    """
    text_input = request.args.get('text')
    if not text_input:
        # A missing or empty parameter is a client error, so it is reported
        # with 400 instead of a 200 that looks like success.
        return jsonify({"error": "Please provide 'text' parameter."}), 400

    prediction = get_tentativa_suicidio(text_input)
    return jsonify({"prediction": prediction})
    
@app.route("/analizar_url", methods=["POST"])
def procesar_url():
    """Fetch the text behind a URL and classify it.

    Expects a JSON object body with a ``url`` key.

    Returns:
        ``{"message": <bool>}`` with status 200 on success, or
        ``{"error": ...}`` with status 400 when the body is not a JSON
        object or has no ``url``.
    """
    # silent=True yields None for a missing or invalid JSON body instead of
    # raising. The isinstance check also covers valid JSON that is not an
    # object (e.g. a list or null), which has no .get().
    data = request.get_json(silent=True)
    url = data.get("url") if isinstance(data, dict) else None
    if not url:
        # Client error: report it with 400 rather than a 200 response.
        return jsonify({"error": "No se proporcionó la URL en la solicitud"}), 400

    print("URL recibida:", url)
    # NOTE(review): get_tentativa_suicidio expects plain text; make sure
    # get_texto returns a string rather than a Selenium element.
    texto = get_texto(url)
    prediction = get_tentativa_suicidio(texto)

    return jsonify({"message": prediction})

# Run the Flask server in a separate thread so this notebook cell returns
# instead of blocking. The reloader is turned off, presumably because it
# cannot run outside the main thread — confirm.
threading.Thread(target=app.run, kwargs={"use_reloader": False}).start()