## Complaints Classifier

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

### Import dataset
To get the dataset, click [here](https://www.kaggle.com/cfpb/us-consumer-finance-complaints).

In [None]:
# Load the consumer-complaints CSV from the Kaggle input directory into a DataFrame.
data = pd.read_csv(
    filepath_or_buffer='../input/us-consumer-finance-complaints/consumer_complaints.csv',
)

In [None]:
# Preview the first five rows of the loaded table.
data.head(n=5)

In [None]:
# Keep only the label column and the free-text narrative column.
data = data[
    [
        'product',
        'consumer_complaint_narrative',
    ]
]
# Count the missing values in each of the two retained columns.
data.isna().sum()

In [None]:
# Discard every row that contains at least one missing value, then preview.
data = data.dropna(axis=0, how='any')
data.head(n=5)

In [None]:
# Distinct product labels, in order of first appearance; a label's position
# in this list is its integer class id.
prod_categ = list(data['product'].unique())
print(len(prod_categ))

# Build the label -> id lookup once so each row is encoded with an O(1) dict
# access instead of a linear `list.index` scan per row.
_prod_to_id = {label: idx for idx, label in enumerate(prod_categ)}
prod_category = [_prod_to_id[label] for label in data['product']]

In [None]:
# Store the integer class ids alongside the original rows as a new column.
data['prod_category'] = prod_category
# Preview the table to check the new column.
data.head()

### Using the TF-IDF vectorizer to convert the complaint text into numerical features

In [None]:
# TF-IDF feature extractor for the complaint narratives.
tfidf = TfidfVectorizer(
    sublinear_tf=True,      # use 1 + log(tf) instead of the raw term frequency
    norm='l2',              # scale every document vector to unit Euclidean length
    min_df=5,               # ignore terms that occur in fewer than 5 documents
    stop_words='english',   # drop scikit-learn's built-in English stop words
    ngram_range=(1, 2),     # extract both unigrams and bigrams
)

In [None]:
# Split the raw narratives BEFORE vectorising so that the TF-IDF vocabulary
# and IDF weights are learned from the training documents only; fitting on
# the full corpus would leak test-set statistics into the features.
y = data.prod_category
text_train, text_test, y_train, y_test = train_test_split(
    data.consumer_complaint_narrative, y, test_size=0.2, random_state=42
)

# Learn the vocabulary/IDF on the training texts, then reuse that fitted
# vectorizer unchanged for the held-out texts.
X_train = tfidf.fit_transform(text_train)
X_test = tfidf.transform(text_test)

# Full-corpus feature matrix, kept for any later cell that expects `X`; it is
# encoded with the train-fitted vectorizer, so no refitting happens here.
X = tfidf.transform(data.consumer_complaint_narrative)

In [None]:
# Fit a linear support-vector classifier on the training split
# (`fit` returns the estimator itself, so `model` is the fitted classifier).
model = LinearSVC().fit(X_train, y_train)

# Predict the held-out split and report the accuracy as a percentage.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy: ', accuracy * 100)