# Librerías

In [None]:
# tabular data handling
import pandas as pd
# NLP toolkit plus the resources needed by the tokenizer, POS tagger and lemmatizer
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('word2vec_sample')
# Recent NLTK releases (3.9+) load the tokenizer and the tagger from these
# renamed packages; requesting both sets keeps word_tokenize / pos_tag working
# on old and new NLTK versions alike.
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')
# persistence of the trained model
import pickle
import joblib
# silence the FutureWarning emitted by Series.replace
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
# translation
from googletrans import Translator
# emoji detection
import emoji

# Pasos

## Importación del dataset

In [None]:
# Path to the CSV file
file_path = '..\\datasets\\Suicide_Detection.csv'

# Limit the import to the first 25,000 rows; remove `nrows` to read the
# complete file.
dataframe = pd.read_csv(file_path, nrows=25000)

# In this CSV the first column is just a row index we do not need. It is
# dropped by position here (pandas names it 'Unnamed: 0', so dropping by that
# name works as well).
dataframe = dataframe.drop(dataframe.columns[0], axis=1)

# Convert the label to boolean. Series.map is used instead of Series.replace
# because replace's implicit object -> bool downcast is deprecated in
# pandas 2.2. Any label other than these two becomes NaN.
dataframe['class'] = dataframe['class'].map({"suicide": True, "non-suicide": False})

print(dataframe)

## Entrenamiento

### Función analizadora

In [None]:
from nltk import pos_tag, word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from string import punctuation

# WordNet lemmatizer instance used by preprocessing_function below
wnl = WordNetLemmatizer()

def penn2morphy(penntag):
    """Convert a Penn Treebank POS tag into the matching WordNet POS letter.

    Only the first two characters of the tag are considered, so 'NNS', 'VBD',
    'JJR', ... collapse onto their base category.

    Args:
        penntag: Penn Treebank tag such as 'NN', 'VBZ' or 'JJ'.

    Returns:
        'n', 'a', 'v' or 'r'. Tags without a mapping (and values that cannot
        be sliced or hashed, e.g. None) fall back to 'n' (noun).
    """
    morphy_tag = {'NN':'n', 'JJ':'a',
                  'VB':'v', 'RB':'r'}
    try:
        return morphy_tag[penntag[:2]]
    except (KeyError, TypeError):
        # KeyError: unmapped tag. TypeError: non-sliceable / unhashable input.
        # Anything else (e.g. KeyboardInterrupt) must propagate.
        return 'n'


# Tokens to discard: NLTK's English stop words plus every character in
# string.punctuation. Emojis are not added to this set; preprocessing_function
# filters them with emoji.purely_emoji instead.
stopwords_en = set(stopwords.words('english')) | set(punctuation)

# Defino la funcion
def preprocessing_function(text):
    """Turn raw text into the list of lemmas that should be counted.

    The text is tokenized and POS-tagged; every token is lower-cased and
    lemmatized with WordNet using the POS derived from its tag. Lemmas found
    in `stopwords_en`, lemmas made only of digits and lemmas that are purely
    emoji are discarded.

    Args:
        text: raw input string.

    Returns:
        list of the surviving lemmas, in their original order.
    """
    words = []

    for word, tag in pos_tag(word_tokenize(text)):
        word_lemmatized = wnl.lemmatize(word.lower(), pos=penn2morphy(tag))

        if word_lemmatized in stopwords_en:
            continue
        if word_lemmatized.isdigit() or emoji.purely_emoji(word_lemmatized):
            continue

        # Deliberately no print() here: this function is passed to
        # CountVectorizer as its analyzer, so it runs for every token of every
        # document and would flood the notebook output.
        words.append(word_lemmatized)

    return words

### Entrenamiento con bolsa de palabras

In [None]:
# Split the data into train and test sets: 15% is held out for testing and
# the split is stratified on the label column.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    dataframe["text"],
    dataframe["class"],
    stratify=dataframe["class"],
    random_state=0,
    test_size=0.15,
)

# Number of training and test labels
len(y_train), len(y_test)

In [None]:
# scikit-learn provides CountVectorizer, which accepts a custom "analyzer".
# The analyzer receives the text we pass in and returns the list of words to count.

from sklearn.feature_extraction.text import CountVectorizer
count_vectorizer = CountVectorizer(analyzer=preprocessing_function)

In [None]:
# Fit the CountVectorizer on the training set only, then transform both splits
X_train_vectorized = count_vectorizer.fit_transform(X_train)
X_test_vectorized = count_vectorizer.transform(X_test)

## Testing

### Función de test

In [None]:
#Vamos a definir nuestra funcion de test y graficar nuestra confusion matrix.
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

def test(clf):
    """Fit `clf`, print its accuracy on the test split and plot the confusion matrix.

    Reads the module-level X_train_vectorized, X_test_vectorized, y_train and
    y_test. The vectorized matrices are converted to dense arrays before being
    handed to the estimator.

    Args:
        clf: estimator exposing fit() and predict(); it is fitted in place.

    Returns:
        The ConfusionMatrixDisplay built from the fitted estimator and the
        test split.
    """
    clf.fit(X_train_vectorized.toarray(), y_train)

    # Densify the test matrix once and reuse it for both the prediction and
    # the confusion matrix instead of converting it on every use.
    X_test_dense = X_test_vectorized.toarray()
    y_pred = clf.predict(X_test_dense)

    print(f"accuracy: {accuracy_score(y_true=y_test, y_pred=y_pred)}")

    return ConfusionMatrixDisplay.from_estimator(
        clf, X_test_dense, y_test, xticks_rotation="vertical"
    )

### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Fix the seed so the fitted tree (and the model file saved from it) is
# reproducible across runs, like the seeded split and the random forest.
clf = DecisionTreeClassifier(random_state=42)
test(clf)

### Guardado

Una versión

In [None]:
import joblib

# Persist the classifier currently bound to `clf` together with the fitted
# vectorizer; both are needed later to score new text.
joblib.dump(clf, '..\\entrenados\\decision_tree\\joblib\\decision_tree_model_25k.pkl')
joblib.dump(count_vectorizer, '..\\entrenados\\decision_tree\\joblib\\decision_tree_vector_25k.pkl')

El que teníamos

In [None]:
# Same two artefacts written with plain pickle. The `with` blocks guarantee
# each file is flushed and closed once the dump finishes.
with open('..\\entrenados\\decision_tree\\dump\\decision_tree_model_25k.pkl', 'wb') as model_file:
    pickle.dump(clf, model_file)
with open('..\\entrenados\\decision_tree\\dump\\decision_tree_vector_25k.pkl', 'wb') as vectorizer_file:
    pickle.dump(count_vectorizer, vectorizer_file)

### Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB
# Rebinds `clf`: from here on it refers to the Gaussian Naive Bayes model
clf = GaussianNB()
test(clf)

### Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Rebinds `clf` again; trees are limited to depth 2 and the seed is fixed
clf = RandomForestClassifier(max_depth=2, random_state=42)
test(clf)

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
# Stored in `lg`, not `clf`, so `clf` keeps pointing at the previously assigned classifier
lg = LogisticRegression()
test(lg)

## Casos de uso

### Si uso el que recien genere

In [None]:
# Use the vectorizer fitted in this session
vect = count_vectorizer
# `clf` is whichever classifier was assigned to that name most recently
model = clf

### Si uso uno ya entrenado

Una versión

In [None]:
# Reload the decision tree and its vectorizer from the joblib files.
# NOTE(review): the vectorizer was created with analyzer=preprocessing_function, so
# loading it presumably requires that function to be defined in this session — confirm.
loaded_model = joblib.load('..\\entrenados\\decision_tree\\joblib\\decision_tree_model_25k.pkl')
loaded_count_vectorizer = joblib.load('..\\entrenados\\decision_tree\\joblib\\decision_tree_vector_25k.pkl')


El que teníamos

In [None]:
# Reload the decision tree and its vectorizer from the plain-pickle files.
# NOTE: pickle.load can execute arbitrary code contained in the file; only
# load files that were produced by this notebook.
with open('..\\entrenados\\decision_tree\\dump\\decision_tree_model_25k.pkl', 'rb') as model_file:
    loaded_model = pickle.load(model_file)
# Bound to `loaded_count_vectorizer` (the name the assignment cell reads); the
# previous spelling `lodead_count_vectorizer` was never picked up.
with open('..\\entrenados\\decision_tree\\dump\\decision_tree_vector_25k.pkl', 'rb') as vectorizer_file:
    loaded_count_vectorizer = pickle.load(vectorizer_file)

Asigno a las variables lo cargado

In [None]:
# Point the names used by the prediction cells at the objects loaded from disk
vect = loaded_count_vectorizer
model = loaded_model

### Usando un dataset

In [None]:
file_path = '..\\datasets\\Suicide_Detection.csv'

# With an integer `skiprows` the first 100004 lines of the file (header
# included) are skipped; the next line is consumed as the header and the line
# after it is the single data row that is read. The column names are
# therefore overwritten right below.
dataframeTest = pd.read_csv(file_path, skiprows=100004, nrows=1)
dataframeTest.columns = ["borrar","text","class"]
dataframeTest = dataframeTest.drop("borrar", axis=1)
# Series.map instead of Series.replace: replace's implicit object -> bool
# downcast is deprecated in pandas 2.2. Unknown labels become NaN.
dataframeTest['class'] = dataframeTest['class'].map({"suicide": True, "non-suicide": False})

print(dataframeTest)

In [None]:
# One translator client is enough for the whole loop
translator = Translator()

for index, row in dataframeTest.iterrows():
    texto = row['text']
    print(texto)

    # The Spanish translation is only printed for the reader; the model is fed
    # the original text.
    traduccion = translator.translate(texto, dest='es').text
    print(traduccion)

    # The vectorizer created in this notebook uses preprocessing_function as
    # its analyzer, so transform() already tokenizes and lemmatizes the raw
    # text. Preprocessing it beforehand would run the pipeline twice and feed
    # the model features that differ from the ones it was trained on.
    texto_vectorizado = vect.transform([texto])

    # test() fits the classifiers on dense arrays, so predict on a dense array
    # as well; take the single prediction out of the result array.
    prediccion = model.predict(texto_vectorizado.toarray())[0]

    clase_real = row['class']
    resultado_prediccion = 'suicida' if prediccion else 'no suicida'
    resultado_real = 'suicida' if clase_real else 'no suicida'

    # Print the predicted label next to the real one
    print(f"Predicción: {resultado_prediccion}. Clase real: {resultado_real}\n")

### Test manual

In [None]:
# Hand-written sentences used as a quick sanity check of the model
textos_prueba = [
    "I want to jump from a bridge",
    "I want to suicide me",
    "I hate my parents with all my heart",
    "I hate all about this life",
    "I cry every night",
    "I don't know what is happen to me, but I don't want live anymore",
]

translator = Translator()

for texto in textos_prueba:
    # The Spanish translation is only printed for the reader
    traduccion = translator.translate(texto, dest='es').text
    print(traduccion)

    # The vectorizer created in this notebook uses preprocessing_function as
    # its analyzer, so the raw text is passed straight to transform();
    # preprocessing it first would apply the pipeline twice and no longer
    # match what the model saw during training.
    texto_vectorizado = vect.transform([texto])

    # Dense input, matching how test() fits the classifiers; [0] extracts the
    # single prediction from the result array.
    prediccion = model.predict(texto_vectorizado.toarray())[0]

    print(f"Predicción para el texto '{texto}': {'suicida' if prediccion else 'no suicida'}\n\n")

## Verificar ocurrencia de palabras

### Palabras más comunes contando palabra por palabra

In [None]:
# Core intuition behind this family of techniques: count word occurrences.
from collections import defaultdict, Counter
from tqdm import tqdm

# Start from an empty counter
word_counts = Counter()

# Iterate over the text column directly. Going through iterrows() and calling
# `.str.split()[0]` on the mixed-type row relied on integer keys falling back
# to positional access on a label-indexed Series, which pandas has deprecated.
for text in tqdm(dataframe["text"], total=len(dataframe)):
    # str.split() without arguments separates words on whitespace
    word_counts.update(text.split())

len(word_counts)

In [None]:
# Show the 25 most frequent entries of the counter with their counts
word_counts.most_common(25)

### Palabras más comunes contando con tokenizador

In [None]:
from nltk import word_tokenize

# Fresh counter for the tokenizer-based count
word_counts = Counter()

for _, record in tqdm(dataframe.iterrows(), total=len(dataframe)):
    # The text is the first value of each row; count every token NLTK produces
    word_counts.update(word_tokenize(record.iat[0]))

len(word_counts)

In [None]:
# Show the 25 most frequent tokens with their counts
word_counts.most_common(25)

### Palabras más comunes teniendo en cuenta stopwords

In [None]:
from nltk.corpus import stopwords

# Plain English stop words (no punctuation yet). Kept as a set so the
# membership test below is O(1) per token instead of a scan over a list.
# NOTE: this rebinds the global `stopwords_en` that preprocessing_function
# also reads.
stopwords_en = set(stopwords.words('english'))

word_counts = Counter()

for row in tqdm(dataframe.iterrows(), total=len(dataframe)):
    # row is (index, Series); the text is the first value of the Series
    text = row[1].iat[0]

    for word in word_tokenize(text):
        word_lowercase = word.lower()

        if word_lowercase not in stopwords_en:
            word_counts[word_lowercase] += 1

len(word_counts)

In [None]:
# Show the 10 most frequent lower-cased tokens that are not stop words
word_counts.most_common(10)

### Palabras más comunes teniendo en cuenta signos de puntuación

In [None]:
from string import punctuation

# Extend the stop-word collection with every character in string.punctuation
# (set union).
stopwords_en = set(stopwords_en) | set(punctuation)

word_counts = Counter()

for _, record in tqdm(dataframe.iterrows(), total=len(dataframe)):
    # Lower-case each token of the row's text, then count those that survive
    # the stop-word / punctuation filter
    lowered = (token.lower() for token in word_tokenize(record.iat[0]))
    word_counts.update(token for token in lowered if token not in stopwords_en)

len(word_counts)

In [None]:
# Show the 10 most frequent tokens after filtering with stopwords_en
word_counts.most_common(10)

### Stemming

In [None]:
# Count word stems produced by the Porter stemmer
from nltk.stem import PorterStemmer

porter = PorterStemmer()

word_counts = Counter()

for _, record in tqdm(dataframe.iterrows(), total=len(dataframe)):
    lowered = (token.lower() for token in word_tokenize(record.iat[0]))
    # Filter on the lower-cased token first, then stem only the survivors
    word_counts.update(
        porter.stem(token) for token in lowered if token not in stopwords_en
    )

len(word_counts)

In [None]:
# Show the 10 most frequent stems with their counts
word_counts.most_common(10)

### Lematización

In [None]:
#Usemos Lemmatization:
from nltk import word_tokenize
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer


# Lemmatizer instance; rebinds the `wnl` already created in the analyzer cell
wnl = WordNetLemmatizer()

def penn2morphy(penntag):
    """Convert a Penn Treebank POS tag into the matching WordNet POS letter.

    This redefines the function of the same name from the analyzer cell. Only
    the first two characters of the tag are considered.

    Args:
        penntag: Penn Treebank tag such as 'NN', 'VBZ' or 'JJ'.

    Returns:
        'n', 'a', 'v' or 'r'. Tags without a mapping (and values that cannot
        be sliced or hashed, e.g. None) fall back to 'n' (noun).
    """
    morphy_tag = {'NN':'n', 'JJ':'a',
                  'VB':'v', 'RB':'r'}
    try:
        return morphy_tag[penntag[:2]]
    except (KeyError, TypeError):
        # Only the lookup failures are swallowed; anything else propagates.
        return 'n'

# Fresh counter for the lemma-based count
word_counts = Counter()

for _, record in tqdm(dataframe.iterrows(), total=len(dataframe)):
    # Tag the tokens of the row's text, lemmatize each lower-cased token with
    # the WordNet POS derived from its tag, and keep lemmas not in stopwords_en
    tagged_tokens = pos_tag(word_tokenize(record.iat[0]))
    lemmas = (
        wnl.lemmatize(token.lower(), pos=penn2morphy(tag))
        for token, tag in tagged_tokens
    )
    word_counts.update(lemma for lemma in lemmas if lemma not in stopwords_en)

len(word_counts)

In [None]:
# Show the 10 most frequent lemmas with their counts
word_counts.most_common(10)