# Librerias

In [None]:
# trabajar con datos tabulares
import pandas as pd
# nlp
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('word2vec_sample')
# guardado del modelo entrenado
import joblib
# eliminar warning del replace
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
# traducir
from googletrans import Translator
# emojis
import emoji
# enum
from enum import Enum

def is_running_on_colab():
    """Report whether the ``google.colab`` package can be imported.

    Returns:
        bool: True when ``import google.colab`` succeeds, False when it
        raises ImportError.
    """
    try:
        import google.colab  # noqa: F401 -- imported only as an availability probe
    except ImportError:
        return False
    else:
        return True

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB

class Modelos(Enum):
    """Identifiers of the classifiers this notebook can train.

    Each member's value is the string key looked up by
    ``select_model_to_train`` and is also embedded in the file names of the
    saved models.
    """

    LOGISTIC_REGRESSION = 'logistic_regression'
    DECISION_TREE = 'decision_tree'
    MULTINOMIAL = 'multinomial'
    BERNOULLI = 'bernoulli'
    # NOTE: spelling kept as-is; the value is part of saved model file names.
    GAUSIAN = 'gausian'

def select_model_to_train(model_name):
    """Return a new, unfitted classifier for the requested model.

    Args:
        model_name: Either a ``Modelos`` member or its string value
            (e.g. ``'logistic_regression'``).

    Returns:
        A freshly constructed scikit-learn estimator.

    Raises:
        ValueError: If ``model_name`` does not identify a known model.
    """
    # Accept the enum member itself as well as its string value.
    if isinstance(model_name, Modelos):
        model_name = model_name.value

    # Map names to factories so only the requested estimator is constructed.
    factories = {
        Modelos.LOGISTIC_REGRESSION.value: lambda: LogisticRegression(max_iter=1000),
        Modelos.DECISION_TREE.value: DecisionTreeClassifier,
        Modelos.MULTINOMIAL.value: MultinomialNB,
        Modelos.BERNOULLI.value: BernoulliNB,
        Modelos.GAUSIAN.value: GaussianNB,
    }

    try:
        factory = factories[model_name]
    except KeyError:
        raise ValueError(f"Modelo '{model_name}' no válido") from None
    return factory()

# Funcion procesador texto

In [None]:
from nltk import pos_tag, word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from string import punctuation

# Shared lemmatizer instance used by preprocessing_function.
wnl = WordNetLemmatizer()

def penn2morphy(penntag):
    """Convert a Penn Treebank POS tag to a WordNet POS letter.

    Only the first two characters of the tag are used for the lookup, so
    ``'NNS'`` and ``'NN'`` map to the same value.

    Args:
        penntag: Penn Treebank tag such as ``'NNS'`` or ``'VBD'``.

    Returns:
        str: ``'n'``, ``'a'``, ``'v'`` or ``'r'``. Falls back to ``'n'`` when
        the tag prefix is unknown or the tag cannot be sliced/looked up.
    """
    morphy_tag = {'NN': 'n', 'JJ': 'a',
                  'VB': 'v', 'RB': 'r'}
    try:
        return morphy_tag[penntag[:2]]
    except (KeyError, TypeError):
        # KeyError: unknown prefix. TypeError: tag is not sliceable or not
        # hashable (e.g. None). Anything else is a real error and propagates.
        return 'n'


# Extra tokens to discard on top of the NLTK list: contraction fragments,
# typographic quotes, repeated dots and invisible/formatting characters.
my_custom_stopwords = {'’', "n't", "'m", "'s", "'ve", '...', 'ca', "''", '``', '\u200d', 'im', 'na', "'ll", '..', 'u', "'re", "'d", '--', '”', '“', '\u200f\u200f\u200e', '....', 'ㅤ','\u200e\u200f\u200f\u200e', 'x200b', 'ive', '.-', '\u200e', '‘'}

# Final stop-word set: NLTK English stop words + punctuation characters +
# the custom tokens above.
stopwords_en = set(stopwords.words('english')) | set(punctuation) | my_custom_stopwords


def preprocessing_function(text):
    """Tokenize ``text`` and return its filtered, lemmatized tokens.

    Each token is POS-tagged, lower-cased and lemmatized with the WordNet
    lemmatizer. A lemma is dropped when it contains a zero-width space
    (U+200B), is in ``stopwords_en``, consists only of digits, or is purely
    emoji.

    Args:
        text: Raw text to process.

    Returns:
        list[str]: The surviving lemmas, in order of appearance.
    """
    lemmas = []

    for token, penn_tag in pos_tag(word_tokenize(text)):
        lemma = wnl.lemmatize(token.lower(), pos=penn2morphy(penn_tag))

        # Tokens carrying a zero-width space are discarded outright.
        if '\u200b' in lemma:
            continue

        # Same checks as the keep-condition, expressed as a skip-condition.
        if lemma in stopwords_en or lemma.isdigit() or emoji.purely_emoji(lemma):
            continue

        lemmas.append(lemma)

    return lemmas

# Funcion test

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

def test(clf, nombreModelo=None):
    """Fit ``clf``, score it on the test split and plot its confusion matrix.

    Reads the module-level ``X_train_vectorized``, ``y_train``,
    ``X_test_vectorized`` and ``y_test``.

    Args:
        clf: Unfitted classifier exposing ``fit`` and ``predict``; it is
            fitted in place.
        nombreModelo: Label shown in the plot title. Defaults to the
            classifier's class name so that ``test(clf)`` is a valid call.

    Returns:
        ConfusionMatrixDisplay: The display object that was plotted.
    """
    if nombreModelo is None:
        nombreModelo = type(clf).__name__

    # The matrices are densified before use, as in the original cell.
    # The training matrix stays a temporary to keep peak memory down.
    clf.fit(X_train_vectorized.toarray(), y_train)

    # Densify and predict on the test split once; the predictions feed both
    # the accuracy score and the confusion matrix.
    y_pred = clf.predict(X_test_vectorized.toarray())
    accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

    disp = ConfusionMatrixDisplay.from_predictions(
        y_test, y_pred, xticks_rotation="vertical"
    )

    plt.title(nombreModelo + " - Accuracy: {:.2f}".format(accuracy))
    plt.show()

    return disp

# Importacion dataset

In [None]:
import os

# Number of CSV rows to load; also used later to build the saved-model suffix.
cant_importada = 50000

# On Colab the CSV is read from /content/; otherwise from ../datasets/.
# os.path.join with a trailing '' yields the platform's own separator
# ('..\\datasets\\' on Windows, '../datasets/' elsewhere).
path_base_dataset = '/content/' if is_running_on_colab() else os.path.join('..', 'datasets', '')
path_dataset = path_base_dataset + 'Suicide_Detection.csv'

dataframe = pd.read_csv(path_dataset, nrows=cant_importada)

print(dataframe)

O lo descargo de internet

In [None]:
from datasets import load_dataset
from sklearn.model_selection import train_test_split

# "jquiros/suicide" is the dataset we normally use.
# NOTE(review): the downloaded dataset is discarded right below and is never
# converted into `dataframe`, so this cell only re-prints the frame loaded
# from the CSV -- confirm whether that is intended.
dataset = load_dataset("vibhorag101/suicide_prediction_dataset_phr", "default")

dataset = None

print(dataframe)

# Checkeo y formateo del dataset

In [None]:
# Number of rows per value of the 'class' column.
dataframe['class'].value_counts()

In [None]:
# In this particular CSV the first column is a saved index we do not need;
# drop it using the name pandas assigns to an unnamed column.
try:
    dataframe = dataframe.drop('Unnamed: 0', axis=1)
except KeyError:
    # drop() raises KeyError when the column is absent (for example when this
    # cell has already been run); any other error is unexpected and propagates.
    print("No se elimino columna Unnamed: 0")

# Alternative: drop it by position instead of by name.
# dataframe = dataframe.drop(dataframe.columns[0], axis=1)

# Convert the classification label to a boolean.
dataframe['class'] = dataframe['class'].replace({"suicide": True, "non-suicide": False})

dataframe.head()

In [None]:
import matplotlib.pyplot as plt

# Rows per class, plus the class labels to use as bar positions.
class_counts = dataframe['class'].value_counts()
class_indices = class_counts.index.values

# One bar per class.
plt.bar(class_indices, class_counts, color=['blue', 'green'])

# Axis labels and title, set in a single call on the current axes.
plt.gca().set(
    xlabel='suicide - non suicide',
    ylabel='Cantidad',
    title="IA APLICADA",
)

# Render the figure.
plt.show()


# Entrenamiento y guardado

In [None]:
from sklearn.model_selection import train_test_split


# Hold out 20% of the rows for testing. The split is seeded and stratified on
# the label so both parts keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    dataframe["text"],
    dataframe["class"],
    test_size=0.20,
    random_state=0,
    stratify=dataframe["class"],
)

len(y_train), len(y_test)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# ----------------------------------------------------------------------
# BoW with binary vectors. These were used in sentiment-analysis tasks, which do not need to know how many times a word is repeated, only whether it is present.
count_vectorizer = CountVectorizer(analyzer=preprocessing_function, binary=True)

# The idea is that a given text is treated as a collection or bag of words, ignoring order and context.
# count_vectorizer = CountVectorizer(analyzer=preprocessing_function)

# Term Frequency - Inverse Document Frequency addresses this by computing a word's importance relative to the other words in the document and in the corpus.
# count_vectorizer = TfidfVectorizer(analyzer=preprocessing_function)

# ----------------------------------------------------------------------

# Fit the vocabulary on the training texts only, then reuse it for the test texts.
X_train_vectorized = count_vectorizer.fit_transform(X_train)
X_test_vectorized = count_vectorizer.transform(X_test)

In [None]:
import os

# Switch each entry to False to skip training/saving that classifier.
modelos_disponibles = {
    Modelos.LOGISTIC_REGRESSION.value : True,
    Modelos.DECISION_TREE.value: True,
    Modelos.MULTINOMIAL.value: True,
    Modelos.BERNOULLI.value: True,
    Modelos.GAUSIAN.value: True,
}

# File-name suffix derived from the number of imported rows, e.g. '50k'.
cant_entrenada_str = "{:.0f}".format(cant_importada/1000) + 'k'

# Output directory, computed once. os.path.join with a trailing '' yields the
# platform's own separator instead of a hard-coded Windows backslash.
path_base_modelo = '/content/' if is_running_on_colab() else os.path.join('..', 'entrenados', '')

for nombreModelo, entrenar in modelos_disponibles.items():
    if entrenar:
        modelToTrain = select_model_to_train(nombreModelo)
        test(modelToTrain, nombreModelo)

        path_modelo = path_base_modelo + nombreModelo + '_' + cant_entrenada_str
        joblib.dump(modelToTrain, path_modelo + '_model.pkl')
        # The vectorizer is saved next to every model so both can be loaded
        # together from the same path prefix.
        joblib.dump(count_vectorizer, path_modelo + '_vector.pkl')
        print(f"Modelo {nombreModelo} guardado en " + path_modelo)
        # Drop the reference to the fitted model before the next iteration.
        modelToTrain = None
    else:
        print(f"Modelo {nombreModelo} no entrenado")
    print("--------------------------------")

# Casos de uso

## Cargar modelo previamente generado

In [None]:
import os

# Which previously saved model to load.
nombre_modelo_prev_entrenado = Modelos.LOGISTIC_REGRESSION.value
# Row-count suffix of the saved files, e.g. '25k' for 25,000 rows.
cant_prev_entrenada = '10k'

# os.path.join with a trailing '' yields the platform's own separator instead
# of a hard-coded Windows backslash.
path_base_modelo_generado = '/content/' if is_running_on_colab() else os.path.join('..', 'entrenados', '')
path_modelo_generado = path_base_modelo_generado + nombre_modelo_prev_entrenado + '_' + cant_prev_entrenada

# SECURITY: joblib.load unpickles arbitrary Python objects. Only load files
# that were produced by this notebook or come from a trusted source.
loaded_model = joblib.load(path_modelo_generado + '_model.pkl')
loaded_count_vectorizer = joblib.load(path_modelo_generado + '_vector.pkl')

vect = loaded_count_vectorizer
model = loaded_model

print(type(vect))
print(type(model))
print(model.n_features_in_)


## Probar con dataset

In [None]:

import os

# os.path.join with a trailing '' yields the platform's own separator instead
# of a hard-coded Windows backslash.
path_base_dataset = '/content/' if is_running_on_colab() else os.path.join('..', 'datasets', '')
path_dataset =  path_base_dataset + 'Suicide_Detection.csv'

cant_a_probar = 5000

# skiprows=40000 also skips the CSV header line, so the first remaining line
# is consumed as the header; the columns are therefore renamed explicitly.
# NOTE(review): these rows should not overlap the rows the loaded model was
# trained on -- confirm against the suffix chosen when loading the model.
dataframeTest = pd.read_csv(path_dataset, skiprows=40000, nrows=cant_a_probar)
dataframeTest.columns = ["borrar","text","class"]
dataframeTest = dataframeTest.drop("borrar", axis=1)
dataframeTest['class'] = dataframeTest['class'].replace({"suicide": True, "non-suicide": False})

print(dataframeTest)

In [None]:
# Vectorize the RAW text. The vectorizer saved by this notebook is built with
# analyzer=preprocessing_function, so transform() already runs the
# preprocessing; doing it here as well would process each text twice and no
# longer match how the training data was vectorized.
predicciones = model.predict(vect.transform(dataframeTest['text']))

# Compare by truthiness, as the labels were mapped to booleans above.
clases_reales = dataframeTest['class'].astype(bool).to_numpy()
counter = int((predicciones.astype(bool) != clases_reales).sum())

# Error rate = mismatches / evaluated rows. The row count actually loaded is
# used, since the file may hold fewer rows than requested.
total_evaluado = len(dataframeTest)
if counter == 0:
    detalle = "no hubo errores"
else:
    detalle = "{:.2f}%".format(100 * counter / total_evaluado)

print("La cantidad de casos donde no coincidio la prediccion con la clasificacion real del dataset: " + str(counter) + " - " + detalle)


## Probar con nuestros datos

In [None]:
# Hand-written sentences used as a quick sanity check of the loaded model.
textos_prueba = [
                "I want to jump from a bridge",
                "I want to suicide me",
                 "I hate my parents with all my heart",
                 "I hate all about this life",
                 "I cry every night",
                 "I don't know what is happen to me, but I don't want live anymore"
                 ]

translator = Translator()

for texto in textos_prueba:
    # Spanish translation, shown only to make the output easier to read.
    traduccion = translator.translate(texto, dest='es').text

    # Printed for inspection only: the tokens the analyzer keeps.
    texto_preprocesado_str = ' '.join(preprocessing_function(texto))
    print(texto_preprocesado_str)

    # Vectorize the RAW text: the vectorizer saved by this notebook already
    # applies preprocessing_function as its analyzer, so passing the
    # preprocessed string would process the text twice.
    texto_vectorizado = vect.transform([texto])

    prediccion = model.predict(texto_vectorizado)

    print(f"'{texto}'\n'{traduccion}'\n{'suicida' if prediccion[0] else 'no suicida'}\n\n")

# Verificar ocurrencia de palabras

## Palabras mas comunes contando palabra por palabra

In [None]:
# Intuición principal en este tipo de tecnicas: Contar las ocurrencias de las palabras.
from collections import defaultdict, Counter
from tqdm import tqdm

# Frequency counter for whitespace-separated words.
word_counts = Counter()

# Iterate the 'text' column directly instead of taking element [0] of each
# row: an integer key on a label-indexed Series relies on a deprecated
# positional fallback.
for text in tqdm(dataframe['text'], total=len(dataframe)):
    # str.split() with no argument splits on runs of whitespace.
    word_counts.update(text.split())

len(word_counts)

In [None]:
# 25 most frequent whitespace-separated words.
word_counts.most_common(25)

## Palabras mas comunes contando con tokenizador

In [None]:
from nltk import word_tokenize

# Frequency counter for the tokens produced by NLTK's word_tokenize.
word_counts = Counter()

for _, fila in tqdm(dataframe.iterrows(), total=len(dataframe)):
    # The text is read positionally from the first column of the row.
    word_counts.update(word_tokenize(fila.iat[0]))

len(word_counts)

In [None]:
# 25 most frequent tokens according to the tokenizer.
word_counts.most_common(25)

In [None]:
# Whitespace splitting of a sample sentence, for comparison with the tokenizer.
print(("hello, how are you?").split())

In [None]:
# The same sample sentence run through NLTK's word_tokenize.
print(word_tokenize("hello, how are you?"))

## Palabras mas comunes contando teniendo en cuenta stopwords

In [None]:
# NOTE(review): this rebinds the module-level `stopwords_en` that
# preprocessing_function reads, replacing the extended set (punctuation and
# custom tokens) with the plain NLTK list -- confirm this is acceptable for
# any later use of the analyzer in the same session.
stopwords_en = stopwords.words('english')
print(stopwords_en)

In [None]:
from nltk.corpus import stopwords

# Plain NLTK English stop-word list.
# NOTE: this rebinds the module-level `stopwords_en` read by
# preprocessing_function.
stopwords_en = stopwords.words('english')

word_counts = Counter()

for _, fila in tqdm(dataframe.iterrows(), total=len(dataframe)):
    # Lower-case every token of the row's first column, then count the ones
    # that are not stop words.
    en_minusculas = (token.lower() for token in word_tokenize(fila.iat[0]))
    word_counts.update(token for token in en_minusculas if token not in stopwords_en)

len(word_counts)

In [None]:
# 10 most frequent tokens once stop words are excluded.
word_counts.most_common(10)

## Palabras mas comunes contando teniendo en cuenta signos de puntuacion

In [None]:
from string import punctuation

# Extend the current stop-word collection with the characters of
# string.punctuation, using set union.
stopwords_en = set(stopwords_en) | set(punctuation)

word_counts = Counter()

for _, fila in tqdm(dataframe.iterrows(), total=len(dataframe)):
    # Lower-cased tokens of the row's first column, minus stop words and
    # punctuation.
    conservados = [
        token.lower()
        for token in word_tokenize(fila.iat[0])
        if token.lower() not in stopwords_en
    ]
    word_counts.update(conservados)

len(word_counts)

In [None]:
# 10 most frequent tokens once stop words and punctuation are excluded.
word_counts.most_common(10)

## Stemming

In [None]:
# Usamos Stemming
from nltk.stem import PorterStemmer

# Porter stemmer used to collapse inflected forms into a common stem.
porter = PorterStemmer()

word_counts = Counter()

for _, fila in tqdm(dataframe.iterrows(), total=len(dataframe)):
    for token in word_tokenize(fila.iat[0]):
        en_minusculas = token.lower()

        # Stop words are skipped before stemming.
        if en_minusculas in stopwords_en:
            continue

        word_counts[porter.stem(en_minusculas)] += 1

len(word_counts)

In [None]:
# 10 most frequent stems.
word_counts.most_common(10)

## Lemmatization

In [None]:
#Usemos Lemmatization:
from nltk import word_tokenize
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer


# WordNet lemmatizer instance used by the counting loop below.
wnl = WordNetLemmatizer()

def penn2morphy(penntag):
    """Convert a Penn Treebank POS tag to a WordNet POS letter.

    Only the first two characters of the tag are used for the lookup, so
    ``'NNS'`` and ``'NN'`` map to the same value.

    Args:
        penntag: Penn Treebank tag such as ``'NNS'`` or ``'VBD'``.

    Returns:
        str: ``'n'``, ``'a'``, ``'v'`` or ``'r'``. Falls back to ``'n'`` when
        the tag prefix is unknown or the tag cannot be sliced/looked up.
    """
    morphy_tag = {'NN': 'n', 'JJ': 'a',
                  'VB': 'v', 'RB': 'r'}
    try:
        return morphy_tag[penntag[:2]]
    except (KeyError, TypeError):
        # KeyError: unknown prefix. TypeError: tag is not sliceable or not
        # hashable (e.g. None). Anything else is a real error and propagates.
        return 'n'

# Frequency counter for lemmas (tokens reduced to their dictionary form).
word_counts = Counter()

for _, fila in tqdm(dataframe.iterrows(), total=len(dataframe)):
    for token, penn_tag in pos_tag(word_tokenize(fila.iat[0])):
        # The POS tag tells the lemmatizer which word class to assume.
        lemma = wnl.lemmatize(token.lower(), pos=penn2morphy(penn_tag))

        if lemma in stopwords_en:
            continue

        word_counts[lemma] += 1

len(word_counts)

## Palabras mas comunes segun mi funcion analizadora

In [None]:
#Usemos Lemmatization:
from collections import Counter
from tqdm import tqdm
from nltk import word_tokenize
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer

# Rebuild the analyzer's full stop-word set. Earlier exploratory cells rebind
# `stopwords_en` to collections that lack the custom tokens, which would make
# this count (and preprocessing_function itself) differ from the analyzer
# that was used for training.
stopwords_en = set(stopwords.words('english')).union(punctuation, my_custom_stopwords)

word_counts = Counter()

for text in tqdm(dataframe['text'], total=len(dataframe)):
    # Reuse the exact analyzer given to the vectorizer rather than a copy of
    # its logic, so the two cannot drift apart.
    word_counts.update(preprocessing_function(text))

len(word_counts)

In [None]:
# 1000 most frequent tokens according to the analyzer's rules.
word_counts.most_common(1000)

# Entrenamientos por clasificador

### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
# test() needs the label shown in the plot title; calling it with only the
# classifier raises TypeError.
dtc = DecisionTreeClassifier()
test(dtc, Modelos.DECISION_TREE.value)

### MultinomialNB

In [None]:
from sklearn.naive_bayes import MultinomialNB
# test() needs the label shown in the plot title; calling it with only the
# classifier raises TypeError.
mnb = MultinomialNB()
test(mnb, Modelos.MULTINOMIAL.value)

### BernoulliNB

In [None]:
from sklearn.naive_bayes import BernoulliNB
# test() needs the label shown in the plot title; calling it with only the
# classifier raises TypeError.
bnb = BernoulliNB()
test(bnb, Modelos.BERNOULLI.value)

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
# lgr = LogisticRegression()
# max_iter is raised to 1000, matching select_model_to_train.
# test() needs the label shown in the plot title; calling it with only the
# classifier raises TypeError.
lgr = LogisticRegression(max_iter=1000)
test(lgr, Modelos.LOGISTIC_REGRESSION.value)

### Gaussian

In [None]:
from sklearn.naive_bayes import GaussianNB
# test() needs the label shown in the plot title; calling it with only the
# classifier raises TypeError.
gnb = GaussianNB()
test(gnb, Modelos.GAUSIAN.value)

### Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest is not part of the Modelos enum, so a literal label is used.
# test() needs that label for the plot title; calling it with only the
# classifier raises TypeError.
clf = RandomForestClassifier(max_depth=2, random_state=42)
test(clf, 'random_forest')