In [1]:
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')

import pandas as pd

from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

from nlp_annotation import data_manager

[nltk_data] Downloading package punkt to /home/victor-
[nltk_data]     dualibi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Where the annotated CSV files live on S3: every file listed in `files` is
# stored under the `data_key` prefix of `bucket`.
bucket = "bella-insights-annotation"
data_key = "bella-dataset-mutant"
files = [
    "caio.pereira.mutant.csv",
    "kelly.silva.mutant.csv",
    "valdete.machado.mutant.csv",
]

In [3]:
# Fetch each listed CSV from S3, producing one entry per file in the same
# order as `files`. The object key is "<data_key>/<file name>" inside `bucket`.
# (The previously computed "s3://..." URL string was never used, so it is gone.)
dataframe_list = [
    data_manager.create_dataframe_from_s3(bucket=bucket, key=f"{data_key}/{file_name}")
    for file_name in files
]

In [4]:
# Stack the per-file frames and keep a single row per distinct `data` text
# (drop_duplicates retains the first occurrence by default).
train_features = pd.concat(dataframe_list).drop_duplicates(subset=["data"])

In [5]:
# Sanity check: (rows, columns) remaining after de-duplication.
train_features.shape

(400, 3)

In [6]:
# Preview the first rows to confirm the expected columns are present.
train_features.head()

Unnamed: 0,id,data,label
0,2202,você recebeu pagamento com point,Renda
1,2203,b2w companhia digital,Compras / Loja de departamento
2,2204,recarga*recarga cel,Contas / Celular
3,2205,recarga oi,Contas / Celular
4,2206,você pagou o mercado crédito,Compras / Apps


In [7]:
# Two-stage text classifier:
#   1. count_vectorizer   - counts of 1- to 4-grams over NLTK word tokens
#   2. logistic_regressor - logistic regression fitted with the liblinear solver
# token_pattern is set to None explicitly because a custom tokenizer is supplied.
pipeline_list = [
    (
        "count_vectorizer",
        CountVectorizer(
            tokenizer=word_tokenize,
            token_pattern=None,
            ngram_range=(1, 4),
        ),
    ),
    (
        "logistic_regressor",
        LogisticRegression(solver="liblinear"),
    ),
]

model_pipeline = Pipeline(steps=pipeline_list)


In [8]:
# Model input is the text column; the class to predict is the label column.
features = train_features["data"]
target = train_features["label"]

# Hold out 20% of the rows for evaluation. The seed is fixed so the split is
# reproducible; rows are shuffled and the split is not stratified by label.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
    shuffle=True,
    stratify=None,
)

In [9]:
# Fit the whole pipeline (vectorizer vocabulary + classifier) on the training split only.
model_pipeline.fit(X_train, y_train)

Pipeline(steps=[('count_vectorizer',
                 CountVectorizer(ngram_range=(1, 4), token_pattern=None,
                                 tokenizer=<function word_tokenize at 0x7f767b0ba940>)),
                ('logistic_regressor', LogisticRegression(solver='liblinear'))])

In [10]:
# Predict labels for both splits so train and held-out accuracy can be compared.
train_y_pred = model_pipeline.predict(X_train)
test_y_pred = model_pipeline.predict(X_test)

In [11]:
def calculate_accuracy(*, real_values, predicted_values):
    """Return the fraction of predictions that equal the true label.

    Elements are paired strictly by position, so pandas Series, NumPy arrays
    and plain sequences can be mixed freely; any index labels carried by a
    Series are ignored.

    Args:
        real_values: Iterable of ground-truth labels.
        predicted_values: Iterable of predicted labels, in the same order and
            of the same length as ``real_values``.

    Returns:
        float: share of positions where both labels are equal, rounded to
        two decimal places.

    Raises:
        ValueError: If the inputs are empty or their lengths differ.
    """
    # Materialise both inputs so they can be measured and walked in lockstep.
    real_vector = list(real_values)
    predicted_vector = list(predicted_values)

    vector_size = len(real_vector)
    if vector_size != len(predicted_vector):
        raise ValueError(
            "real_values and predicted_values must have the same length "
            "(got {} and {})".format(vector_size, len(predicted_vector))
        )
    if vector_size == 0:
        raise ValueError("cannot compute accuracy of empty inputs")

    # zip pairs by position; indexing predicted_values[i] would be a label
    # lookup on a Series whose index is not 0..n-1.
    counter = sum(
        1 for real, predicted in zip(real_vector, predicted_vector) if real == predicted
    )

    return round(counter / vector_size, 2)
    

In [12]:
# Accuracy on the training split (the model was fitted on these rows, so this is an optimistic figure).
calculate_accuracy(real_values=y_train, predicted_values=train_y_pred)

0.88

In [13]:
# Accuracy on the held-out split.
# NOTE(review): the recorded outputs show 0.88 on train vs 0.31 here, a large
# gap between seen and unseen rows that should be investigated before this
# model is relied upon.
calculate_accuracy(real_values=y_test, predicted_values=test_y_pred)

0.31

In [18]:
from nlp_annotation import modelling, pipeline

In [19]:
# Show the pipeline shipped in nlp_annotation.pipeline.
# NOTE(review): the recorded repr has ngram_range=['1', '4'] and
# token_pattern='None' (strings), whereas the pipeline built in this notebook
# uses (1, 4) and None. It looks like these values are not converted from text
# in the packaged pipeline; confirm in nlp_annotation.pipeline.
pipeline.model_pipeline

Pipeline(steps=[('count_vectorizer',
                 CountVectorizer(ngram_range=['1', '4'], token_pattern='None',
                                 tokenizer=<function word_tokenize at 0x7f767b0ba940>)),
                ('logistic_regressor', LogisticRegression(solver='liblinear'))])

In [20]:
# Show the pipeline built in this notebook, for comparison with the packaged one.
model_pipeline

Pipeline(steps=[('count_vectorizer',
                 CountVectorizer(ngram_range=(1, 4), token_pattern=None,
                                 tokenizer=<function word_tokenize at 0x7f767b0ba940>)),
                ('logistic_regressor', LogisticRegression(solver='liblinear'))])