## Trabalho 
#### Carregar o conjunto de dados do site de complaints e categorizar o tipo de produto baseado no texto do usuário

### 1. Carregando o dataset 

In [2]:
import pandas as pd

In [3]:
data = pd.read_csv('complaints.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
data.shape

(2124146, 18)

In [5]:
data = data[data['Consumer complaint narrative'].notna()]

In [6]:
X = data['Consumer complaint narrative'].head(10000)
y = data['Product'].head(10000)

In [7]:
X

5        Two accounts are still on my credit history af...
9        I opened a citi double cash card the beginning...
10       I filed 2 disputes with XXXX XXXX XXXX in XX/X...
11       In dispute of the loan # XXXX, for XXXX XXXX t...
31       Notice to whom it be of concern,It is YOU I ho...
                               ...                        
33159    Two rounds of dispute letters were sent to rem...
33161    Summit Financial is continuously reporting I o...
33162    I received a notification on XXXX XX/XX/2019 v...
33165    On XX/XX/XXXX I used a service call JPay to se...
33166    I am a victim of identity theft. The informati...
Name: Consumer complaint narrative, Length: 10000, dtype: object

In [8]:
y

5        Credit reporting, credit repair services, or o...
9                              Credit card or prepaid card
10       Credit reporting, credit repair services, or o...
11                                                Mortgage
31       Credit reporting, credit repair services, or o...
                               ...                        
33159    Credit reporting, credit repair services, or o...
33161    Credit reporting, credit repair services, or o...
33162                                      Debt collection
33165    Money transfer, virtual currency, or money ser...
33166    Credit reporting, credit repair services, or o...
Name: Product, Length: 10000, dtype: object

### 2. Testes com alguns modelos

#### 2.1 Textos usando para testes

In [9]:
textos = ['I do not like to study', 
          'I want to go to college', 
          'I will be a computer student',
          'I need money',
          'I like data science',
          'I used my emails years ago paying for a transaction',
          'Student, student, student.']

#### 2.2 Usando o CountVectorizer

In [10]:
from sklearn.feature_extraction.text import CountVectorizer
#Pré-processamento de texto, tokenização e filtragem de palavras irrelevantes estão incluídos no CountVectorizer, 
#que cria um dicionário de recursos e transforma documentos em vetores de recursos.

In [11]:
count_vect = CountVectorizer(lowercase=True)
X_train = count_vect.fit_transform(X)

In [12]:
X_train.shape

(10000, 19980)

In [13]:
X_train[0]

<1x19980 sparse matrix of type '<class 'numpy.int64'>'
	with 20 stored elements in Compressed Sparse Row format>

In [14]:
X_train[0].todense()

matrix([[0, 0, 0, ..., 0, 0, 0]])

In [15]:
count_vect.vocabulary_.get(u'student')

16965

In [16]:
teste = count_vect.transform(textos)

In [17]:
from sklearn.naive_bayes import MultinomialNB
#Classificador para tentar prever a categoria de uma postagem.

In [18]:
nb = MultinomialNB()
nb.fit(X_train,y)

MultinomialNB()

In [19]:
preds = nb.predict(teste)

In [20]:
for doc, pred in zip(textos, preds):
    print(f'Texto: {doc} \nClasse: {pred}\n')

Texto: I do not like to study 
Classe: Credit reporting, credit repair services, or other personal consumer reports

Texto: I want to go to college 
Classe: Student loan

Texto: I will be a computer student 
Classe: Credit reporting, credit repair services, or other personal consumer reports

Texto: I need money 
Classe: Credit reporting, credit repair services, or other personal consumer reports

Texto: I like data science 
Classe: Credit reporting, credit repair services, or other personal consumer reports

Texto: I used my emails years ago paying for a transaction 
Classe: Credit card or prepaid card

Texto: Student, student, student. 
Classe: Student loan



### 3. Separando a base em treino e teste

In [21]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y)

### 4. Avaliando o modelo CountVectorizer

In [22]:
X_test_countVect = count_vect.transform(X_test)

In [23]:
nb.score(X_test_countVect,y_test)

0.8224

In [24]:
from sklearn.ensemble import RandomForestClassifier 

### 5. Adicionando em pipelines

In [25]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier

#### 5.1 CountVectorizer com MultinomialNB

In [26]:
pipeline = Pipeline([
                        ('count_vect', CountVectorizer(lowercase=True,stop_words='english')),
                        ('clf', MultinomialNB())
                    ])

In [27]:
pipeline.fit(X_train, y_train)
pipeline.score(X_test, y_test)

0.7804

#### 5.2 CountVectorizer com RandomForestClassifier

In [28]:
pipeline = Pipeline([
                        ('count_vect', CountVectorizer(lowercase=True,stop_words='english')),
                        ('clf', RandomForestClassifier())
                    ])

In [29]:
pipeline.fit(X_train, y_train)
pipeline.score(X_test, y_test)

0.7384

#### 5.3 TfidfVectorizer com MultinomialNB

In [30]:
pipeline = Pipeline([
                        ('count_vect', TfidfVectorizer(lowercase=True,stop_words='english')),
                        ('clf', MultinomialNB())
                    ])


In [31]:
pipeline.fit(X_train,y_train)
pipeline.score(X_test,y_test)

0.526

#### 5.3 CountVectorizer, TfidfTransformer e MultinomialNB

In [32]:
#TfidfTransformer - Transform a count matrix to a normalized tf or tf-idf representation
pipeline = Pipeline([
                         ('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()), #usando l1 deu um resultado pior que o l2
                         ('clf', MultinomialNB()),
                    ])

In [33]:
pipeline.fit(X_train,y_train)
pipeline.score(X_test,y_test)

0.4908

#### 5.4 CountVectorizer, TfidfTransformer e SGDClassifier

In [34]:
pipeline = Pipeline([
                         ('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('clf', SGDClassifier(loss='hinge', penalty='l2',
                                               alpha=1e-3, random_state=42,
                                               max_iter=5, tol=None)),
                     ])

In [35]:
pipeline.fit(X_train,y_train)
pipeline.score(X_test,y_test)

0.782