# Clonando Repositório

In [1]:
!git clone https://github.com/ufrpe-mineracao-textos/projeto-de-mineracao-20192-equipy.git

Cloning into 'projeto-de-mineracao-20192-equipy'...
remote: Enumerating objects: 30, done.[K
remote: Counting objects: 100% (30/30), done.[K
remote: Compressing objects: 100% (24/24), done.[K
remote: Total 30 (delta 13), reused 18 (delta 6), pack-reused 0[K
Unpacking objects: 100% (30/30), done.


# Carregando Dados

In [2]:
import spacy
import pandas as pd
from spacy.lang.pt.stop_words import STOP_WORDS

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn import metrics

!python -m spacy download pt

Collecting pt_core_news_sm==2.1.0
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/pt_core_news_sm-2.1.0/pt_core_news_sm-2.1.0.tar.gz (12.8MB)
[K     |████████████████████████████████| 12.9MB 1.7MB/s 
[?25hBuilding wheels for collected packages: pt-core-news-sm
  Building wheel for pt-core-news-sm (setup.py) ... [?25l[?25hdone
  Created wheel for pt-core-news-sm: filename=pt_core_news_sm-2.1.0-cp36-none-any.whl size=12843677 sha256=ae959ec078ee4729855b1bf8e75ad3d10b5673e4fbfd32e18917919ca63e6db1
  Stored in directory: /tmp/pip-ephem-wheel-cache-nfw61rzz/wheels/a3/8f/c1/f036e3a7f1aa44fb06a534c6c4b1c2b773f101fdb1f163c08c
Successfully built pt-core-news-sm
Installing collected packages: pt-core-news-sm
Successfully installed pt-core-news-sm-2.1.0
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('pt_core_news_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.6/dist-packages/pt_core_news_sm -->
/usr/

In [0]:
nlp = spacy.load('pt')

path = "projeto-de-mineracao-20192-equipy/"
data = pd.read_csv(path+"new_dataset.csv")

docs = []
docs_lemma = []

for index, row in data.iterrows():
  text = row['text'].lower()
  text = nlp(text)
  
  text_x = "".join([token.text + " " for token in text if not (token.is_punct or token.is_stop)])
  # Remove as pontuações e as stop words e faz o Lemmatization
  text_lemma = "".join([token.lemma_ + " " for token in text if not (token.is_punct or token.is_stop)])

  docs.append({ 'text': text_x, 'label': row['label'] }) 
  docs_lemma.append({ 'text': text_lemma }) 

docs = pd.DataFrame(docs)
docs_lemma = pd.DataFrame(docs_lemma)

texto, classe = docs['text'], docs['label']
texto_lemma = docs_lemma['text']

In [0]:
treino_texto = texto[:1300]
teste_texto = texto[1301:2000]

treino_classe = classe[:1300]
teste_classe = classe[1301:2000]

treino_lemma_texto = texto_lemma[:1300]
teste_lemma_texto = texto_lemma[1301:2000]

# Extração de caracteristicas

## Sem Lemmatization

In [0]:
count_vect = CountVectorizer(encoding='latin-1')

X_treino_counts = count_vect.fit_transform(treino_texto)
X_treino_counts.shape

tfidf_transformer = TfidfTransformer(use_idf=True)
X_train_tfidf = tfidf_transformer.fit_transform(X_treino_counts)
X_train_tfidf.shape

X_teste_counts = count_vect.transform(teste_texto)
X_teste_tfidf = tfidf_transformer.transform(X_teste_counts)

## Com Lemmatization

In [0]:
count_vect_lemma = CountVectorizer(encoding='latin-1')

X_treino_lemma_counts = count_vect_lemma.fit_transform(treino_lemma_texto)
X_treino_lemma_counts.shape

tfidf_transformer_lemma = TfidfTransformer(use_idf=True)
X_train_lemma_tfidf = tfidf_transformer_lemma.fit_transform(X_treino_lemma_counts)
X_train_lemma_tfidf.shape

X_teste_lemma_counts = count_vect_lemma.transform(teste_lemma_texto)
X_teste_lemma_tfidf = tfidf_transformer_lemma.transform(X_teste_lemma_counts)

# Classificação

## Naive Bayes

### Sem Lemmatization

In [0]:
classf_nb = MultinomialNB().fit(X_train_tfidf, treino_classe)

#### Avaliação

In [8]:
predito_nb = classf_nb.predict(X_teste_tfidf)
accuracy_score(teste_classe, predito_nb)

print(metrics.classification_report(teste_classe, predito_nb))

              precision    recall  f1-score   support

           0       0.78      0.54      0.64       399
           1       0.57      0.80      0.66       300

    accuracy                           0.65       699
   macro avg       0.68      0.67      0.65       699
weighted avg       0.69      0.65      0.65       699



#### Cross Validation

In [9]:
scores = cross_val_score(classf_nb, X_train_tfidf, treino_classe, cv=10)
scores_f1 = cross_val_score(classf_nb, X_train_tfidf, treino_classe, cv=10, scoring='f1_macro')

print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
print("F-Measure: %0.2f (+/- %0.2f)" % (scores_f1.mean(), scores_f1.std() * 2))

Accuracy: 0.72 (+/- 0.10)
F-Measure: 0.68 (+/- 0.15)


### Com Lemmatization

In [0]:
classf_nb_lemma = MultinomialNB().fit(X_train_lemma_tfidf, treino_classe)

#### Avaliação

In [11]:
predito_nb_lemma = classf_nb_lemma.predict(X_teste_lemma_tfidf)
accuracy_score(teste_classe, predito_nb_lemma)

print(metrics.classification_report(teste_classe, predito_nb_lemma))

              precision    recall  f1-score   support

           0       0.78      0.56      0.65       399
           1       0.58      0.79      0.67       300

    accuracy                           0.66       699
   macro avg       0.68      0.68      0.66       699
weighted avg       0.69      0.66      0.66       699



#### Cross Validation

In [12]:
scores = cross_val_score(classf_nb_lemma, X_train_lemma_tfidf, treino_classe, cv=10)
scores_f1 = cross_val_score(classf_nb_lemma, X_train_lemma_tfidf, treino_classe, cv=10, scoring='f1_macro')

print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
print("F-Measure: %0.2f (+/- %0.2f)" % (scores_f1.mean(), scores_f1.std() * 2))

Accuracy: 0.72 (+/- 0.14)
F-Measure: 0.67 (+/- 0.20)


## Árvore de Decisão

### Sem Lemmatization

In [0]:
classf_tree = DecisionTreeClassifier().fit(X_train_tfidf, treino_classe)

#### Avaliação

In [14]:
predito_tree = classf_tree.predict(X_teste_tfidf)
accuracy_score(teste_classe, predito_tree)

print(metrics.classification_report(teste_classe, predito_tree))

              precision    recall  f1-score   support

           0       0.67      0.39      0.49       399
           1       0.48      0.74      0.58       300

    accuracy                           0.54       699
   macro avg       0.57      0.57      0.54       699
weighted avg       0.59      0.54      0.53       699



#### Cross Validation

In [15]:
scores = cross_val_score(classf_tree, X_train_tfidf, treino_classe, cv=10)
scores_f1 = cross_val_score(classf_tree, X_train_tfidf, treino_classe, cv=10, scoring='f1_macro')

print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
print("F-Measure: %0.2f (+/- %0.2f)" % (scores_f1.mean(), scores_f1.std() * 2))

Accuracy: 0.67 (+/- 0.18)
F-Measure: 0.65 (+/- 0.18)


### Com Lemmatization

In [0]:
classf_tree_lemma = DecisionTreeClassifier().fit(X_train_lemma_tfidf, treino_classe)

#### Avaliação

In [17]:
predito_tree_lemma = classf_tree_lemma.predict(X_teste_lemma_tfidf)
accuracy_score(teste_classe, predito_tree_lemma)

print(metrics.classification_report(teste_classe, predito_tree_lemma))

              precision    recall  f1-score   support

           0       0.66      0.38      0.48       399
           1       0.47      0.73      0.57       300

    accuracy                           0.53       699
   macro avg       0.56      0.56      0.53       699
weighted avg       0.58      0.53      0.52       699



#### Cross Validation

In [18]:
scores = cross_val_score(classf_tree_lemma, X_train_lemma_tfidf, treino_classe, cv=10)
scores_f1 = cross_val_score(classf_tree_lemma, X_train_lemma_tfidf, treino_classe, cv=10, scoring='f1_macro')

print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
print("F-Measure: %0.2f (+/- %0.2f)" % (scores_f1.mean(), scores_f1.std() * 2))

Accuracy: 0.68 (+/- 0.12)
F-Measure: 0.66 (+/- 0.15)


## Redes Neurais

### Sem Lemmatization

In [0]:
classf_mlp = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1).fit(X_train_tfidf, treino_classe)

#### Avaliação

In [20]:
predito_mlp = classf_mlp.predict(X_teste_tfidf)
accuracy_score(teste_classe, predito_mlp)

print(metrics.classification_report(teste_classe, predito_mlp))

              precision    recall  f1-score   support

           0       0.66      0.76      0.71       399
           1       0.61      0.49      0.54       300

    accuracy                           0.65       699
   macro avg       0.64      0.63      0.63       699
weighted avg       0.64      0.65      0.64       699



#### Cross Validation

In [22]:
scores = cross_val_score(classf_mlp, X_train_tfidf, treino_classe, cv=10)
scores_f1 = cross_val_score(classf_mlp, X_train_tfidf, treino_classe, cv=10, scoring='f1_macro')

print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
print("F-Measure: %0.2f (+/- %0.2f)" % (scores_f1.mean(), scores_f1.std() * 2))

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Accuracy: 0.66 (+/- 0.13)
F-Measure: 0.50 (+/- 0.30)


  'precision', 'predicted', average, warn_for)


### Com Lemmatization

In [0]:
classf_mlp_lemma = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1).fit(X_train_lemma_tfidf, treino_classe)

#### Avaliação

In [24]:
predito_mlp_lemma = classf_mlp_lemma.predict(X_teste_lemma_tfidf)
accuracy_score(teste_classe, predito_mlp_lemma)

print(metrics.classification_report(teste_classe, predito_mlp_lemma))

              precision    recall  f1-score   support

           0       0.57      0.98      0.72       399
           1       0.58      0.04      0.07       300

    accuracy                           0.58       699
   macro avg       0.58      0.51      0.40       699
weighted avg       0.58      0.58      0.44       699



#### Cross Validation

In [25]:
scores = cross_val_score(classf_mlp_lemma, X_train_lemma_tfidf, treino_classe, cv=10)
scores_f1 = cross_val_score(classf_mlp_lemma, X_train_lemma_tfidf, treino_classe, cv=10, scoring='f1_macro')

print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
print("F-Measure: %0.2f (+/- %0.2f)" % (scores_f1.mean(), scores_f1.std() * 2))

Accuracy: 0.71 (+/- 0.23)
F-Measure: 0.69 (+/- 0.23)


## Random Forests

### Sem Lemmatization

In [0]:
classf_rf = RandomForestClassifier(n_estimators=100).fit(X_train_tfidf, treino_classe)

#### Avaliação

In [27]:
predito_rf = classf_rf.predict(X_teste_tfidf)
accuracy_score(teste_classe, predito_rf)

print(metrics.classification_report(teste_classe, predito_rf))

              precision    recall  f1-score   support

           0       0.75      0.28      0.40       399
           1       0.48      0.88      0.62       300

    accuracy                           0.54       699
   macro avg       0.62      0.58      0.51       699
weighted avg       0.63      0.54      0.50       699



#### Cross Validation

In [28]:
scores = cross_val_score(classf_rf, X_train_tfidf, treino_classe, cv=10)
scores_f1 = cross_val_score(classf_rf, X_train_tfidf, treino_classe, cv=10, scoring='f1_macro')

print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
print("F-Measure: %0.2f (+/- %0.2f)" % (scores_f1.mean(), scores_f1.std() * 2))

Accuracy: 0.72 (+/- 0.13)
F-Measure: 0.67 (+/- 0.19)


### Com Lemmatization

In [0]:
classf_rf_lemma = RandomForestClassifier(n_estimators=100).fit(X_train_lemma_tfidf, treino_classe)

#### Avaliação

In [30]:
predito_rf_lemma = classf_rf_lemma.predict(X_teste_lemma_tfidf)
accuracy_score(teste_classe, predito_rf_lemma)

print(metrics.classification_report(teste_classe, predito_rf_lemma))

              precision    recall  f1-score   support

           0       0.78      0.23      0.36       399
           1       0.47      0.91      0.62       300

    accuracy                           0.53       699
   macro avg       0.63      0.57      0.49       699
weighted avg       0.65      0.53      0.47       699



#### Cross Validation

In [32]:
scores = cross_val_score(classf_rf_lemma, X_train_lemma_tfidf, treino_classe, cv=10)
scores_f1 = cross_val_score(classf_rf_lemma, X_train_lemma_tfidf, treino_classe, cv=10, scoring='f1_macro')

print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
print("F-Measure: %0.2f (+/- %0.2f)" % (scores_f1.mean(), scores_f1.std() * 2))

Accuracy: 0.73 (+/- 0.15)
F-Measure: 0.68 (+/- 0.20)
