# Clonando Repositório

In [1]:
!git clone https://github.com/ufrpe-mineracao-textos/projeto-de-mineracao-20192-equipy.git

Cloning into 'projeto-de-mineracao-20192-equipy'...
remote: Enumerating objects: 27, done.[K
remote: Counting objects: 100% (27/27), done.[K
remote: Compressing objects: 100% (21/21), done.[K
remote: Total 27 (delta 11), reused 18 (delta 6), pack-reused 0[K
Unpacking objects: 100% (27/27), done.


# Carregando Dados

In [2]:
import spacy
import pandas as pd
from spacy.lang.pt.stop_words import STOP_WORDS

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score
from sklearn import metrics

!python -m spacy download pt

Collecting pt_core_news_sm==2.1.0
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/pt_core_news_sm-2.1.0/pt_core_news_sm-2.1.0.tar.gz (12.8MB)
[K     |████████████████████████████████| 12.9MB 1.8MB/s 
[?25hBuilding wheels for collected packages: pt-core-news-sm
  Building wheel for pt-core-news-sm (setup.py) ... [?25l[?25hdone
  Created wheel for pt-core-news-sm: filename=pt_core_news_sm-2.1.0-cp36-none-any.whl size=12843677 sha256=f137148fef56ef3eeb02a3ba59ff6dfc5251309a4b490b1723c1876b9328b0db
  Stored in directory: /tmp/pip-ephem-wheel-cache-iqwkui2e/wheels/a3/8f/c1/f036e3a7f1aa44fb06a534c6c4b1c2b773f101fdb1f163c08c
Successfully built pt-core-news-sm
Installing collected packages: pt-core-news-sm
Successfully installed pt-core-news-sm-2.1.0
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('pt_core_news_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.6/dist-packages/pt_core_news_sm -->
/usr/

In [0]:
nlp = spacy.load('pt')

path = "projeto-de-mineracao-20192-equipy/"
data = pd.read_csv(path+"new_dataset.csv")

docs = []
docs_lemma = []

for index, row in data.iterrows():
  text = row['text'].lower()
  text = nlp(text)
  
  text_x = "".join([token.text + " " for token in text if not (token.is_punct or token.is_stop)])
  # Remove as pontuações e as stop words e faz o Lemmatization
  text_lemma = "".join([token.lemma_ + " " for token in text if not (token.is_punct or token.is_stop)])

  docs.append({ 'text': text_x, 'label': row['label'] }) 
  docs_lemma.append({ 'text': text_lemma }) 

docs = pd.DataFrame(docs)
docs_lemma = pd.DataFrame(docs_lemma)

texto, classe = docs['text'], docs['label']
texto_lemma = docs_lemma['text']

In [0]:
treino_texto = texto[:1300]
teste_texto = texto[1301:2000]

treino_classe = classe[:1300]
teste_classe = classe[1301:2000]

treino_lemma_texto = texto_lemma[:1300]
teste_lemma_texto = texto_lemma[1301:2000]

# Extração de caracteristicas

## Sem Lemmatization

In [0]:
count_vect = CountVectorizer(encoding='latin-1')

X_treino_counts = count_vect.fit_transform(treino_texto)
X_treino_counts.shape

tfidf_transformer = TfidfTransformer(use_idf=True)
X_train_tfidf = tfidf_transformer.fit_transform(X_treino_counts)
X_train_tfidf.shape

X_teste_counts = count_vect.transform(teste_texto)
X_teste_tfidf = tfidf_transformer.transform(X_teste_counts)

## Com Lemmatization

In [0]:
count_vect_lemma = CountVectorizer(encoding='latin-1')

X_treino_lemma_counts = count_vect_lemma.fit_transform(treino_lemma_texto)
X_treino_lemma_counts.shape

tfidf_transformer_lemma = TfidfTransformer(use_idf=True)
X_train_lemma_tfidf = tfidf_transformer_lemma.fit_transform(X_treino_lemma_counts)
X_train_lemma_tfidf.shape

X_teste_lemma_counts = count_vect_lemma.transform(teste_lemma_texto)
X_teste_lemma_tfidf = tfidf_transformer_lemma.transform(X_teste_lemma_counts)

# Classificação

## Naive Bayes

### Sem Lemmatization

In [0]:
classf_nb = MultinomialNB().fit(X_train_tfidf, treino_classe)

#### Avaliação

In [0]:
predito_nb = classf_nb.predict(X_teste_tfidf)
accuracy_score(teste_classe, predito_nb)

print(metrics.classification_report(teste_classe, predito_nb))

              precision    recall  f1-score   support

           0       0.78      0.54      0.64       399
           1       0.57      0.80      0.66       300

    accuracy                           0.65       699
   macro avg       0.68      0.67      0.65       699
weighted avg       0.69      0.65      0.65       699



### Com Lemmatization

In [0]:
classf_nb_lemma = MultinomialNB().fit(X_train_lemma_tfidf, treino_classe)

#### Avaliação

In [0]:
predito_nb_lemma = classf_nb_lemma.predict(X_teste_lemma_tfidf)
accuracy_score(teste_classe, predito_nb_lemma)

print(metrics.classification_report(teste_classe, predito_nb_lemma))

              precision    recall  f1-score   support

           0       0.78      0.56      0.65       399
           1       0.58      0.79      0.67       300

    accuracy                           0.66       699
   macro avg       0.68      0.68      0.66       699
weighted avg       0.69      0.66      0.66       699



## Árvore de Decisão

### Sem Lemmatization

In [0]:
classf_tree = DecisionTreeClassifier().fit(X_train_tfidf, treino_classe)

#### Avaliação

In [0]:
predito_tree = classf_tree.predict(X_teste_tfidf)
accuracy_score(teste_classe, predito_tree)

print(metrics.classification_report(teste_classe, predito_tree))

              precision    recall  f1-score   support

           0       0.69      0.37      0.48       399
           1       0.48      0.78      0.60       300

    accuracy                           0.55       699
   macro avg       0.59      0.58      0.54       699
weighted avg       0.60      0.55      0.53       699



### Com Lemmatization

In [0]:
classf_tree_lemma = DecisionTreeClassifier().fit(X_train_lemma_tfidf, treino_classe)

#### Avaliação

In [0]:
predito_tree_lemma = classf_tree_lemma.predict(X_teste_lemma_tfidf)
accuracy_score(teste_classe, predito_tree_lemma)

print(metrics.classification_report(teste_classe, predito_tree_lemma))

              precision    recall  f1-score   support

           0       0.65      0.36      0.46       399
           1       0.46      0.74      0.57       300

    accuracy                           0.52       699
   macro avg       0.56      0.55      0.52       699
weighted avg       0.57      0.52      0.51       699



## Redes Neurais

### Sem Lemmatization

In [0]:
classf_mlp = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1).fit(X_train_tfidf, treino_classe)

#### Avaliação

In [0]:
predito_mlp = classf_mlp.predict(X_teste_tfidf)
accuracy_score(teste_classe, predito_mlp)

print(metrics.classification_report(teste_classe, predito_mlp))

              precision    recall  f1-score   support

           0       0.66      0.76      0.71       399
           1       0.61      0.49      0.54       300

    accuracy                           0.65       699
   macro avg       0.64      0.63      0.63       699
weighted avg       0.64      0.65      0.64       699



### Com Lemmatization

In [0]:
classf_mlp_lemma = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1).fit(X_train_lemma_tfidf, treino_classe)

#### Avaliação

In [0]:
predito_mlp_lemma = classf_mlp_lemma.predict(X_teste_lemma_tfidf)
accuracy_score(teste_classe, predito_mlp_lemma)

print(metrics.classification_report(teste_classe, predito_mlp_lemma))

              precision    recall  f1-score   support

           0       0.57      0.98      0.72       399
           1       0.58      0.04      0.07       300

    accuracy                           0.58       699
   macro avg       0.58      0.51      0.40       699
weighted avg       0.58      0.58      0.44       699



## Random Forests

### Sem Lemmatization

In [0]:
classf_rf = RandomForestClassifier(n_estimators=100).fit(X_train_tfidf, treino_classe)

#### Avaliação

In [20]:
predito_rf = classf_rf.predict(X_teste_tfidf)
accuracy_score(teste_classe, predito_rf)

print(metrics.classification_report(teste_classe, predito_rf))

              precision    recall  f1-score   support

           0       0.79      0.30      0.43       399
           1       0.49      0.89      0.63       300

    accuracy                           0.55       699
   macro avg       0.64      0.59      0.53       699
weighted avg       0.66      0.55      0.52       699



### Com Lemmatization

In [0]:
classf_rf_lemma = RandomForestClassifier(n_estimators=100).fit(X_train_lemma_tfidf, treino_classe)

#### Avaliação

In [22]:
predito_rf_lemma = classf_rf_lemma.predict(X_teste_lemma_tfidf)
accuracy_score(teste_classe, predito_rf_lemma)

print(metrics.classification_report(teste_classe, predito_rf_lemma))

              precision    recall  f1-score   support

           0       0.77      0.26      0.39       399
           1       0.48      0.90      0.62       300

    accuracy                           0.53       699
   macro avg       0.63      0.58      0.51       699
weighted avg       0.65      0.53      0.49       699

