In [27]:
import os
import joblib
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from scipy.stats import randint
import seaborn as sns # used for plot interactive graph. 
import matplotlib.pyplot as plt
import seaborn as sns
from io import StringIO
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2
from IPython.display import display
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn import metrics

In [28]:
# loading data
df = pd.read_excel('/content/data.xlsx')

In [29]:
df.shape

(187, 3)

In [30]:
# Create a new column 'category_id' with encoded categories 
df['category_id'] = df['Category'].factorize()[0]
category_id_df = df[['Category', 'category_id']].drop_duplicates()
# Dictionaries for future use
category_to_id = dict(category_id_df.values)
id_to_category = dict(category_id_df[['category_id', 'Category']].values)
# New dataframe
df.head(180)

Unnamed: 0.1,Unnamed: 0,Text,Category,category_id
0,0,I'm sorry. I feel like such a jerk.,Defuse,0
1,1,I'm sorry. Clearly I've made a mistake.,Defuse,0
2,4,That’s not a problem. Can I give you my number...,Defuse,0
3,5,oh nice where did you go?,Defuse,0
4,6,I'm sorry. I feel like such a jerk.,Defuse,0
...,...,...,...,...
175,227,I'm about to ask you a direct question. \n\nI ...,Discuss,2
176,228,(If competitor.) \n\nWould it make sense for y...,Discuss,2
177,229,What price were you expecting?,Discuss,2
178,230,What price were you expecting?\n\nIt seems lik...,Discuss,2


In [31]:
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5,ngram_range=(1, 2), stop_words='english')
# We transform each article_text into a vector
features = tfidf.fit_transform(df.Category.values.astype('U')).toarray()
labels = df.category_id
# print("Each of the %d article_text is represented by %d features (TF-IDF score of unigrams and bigrams)" %(features.shape))

In [32]:
X = df[['Text']] # Collection of documents
y = df[['Category']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,random_state = 0)

In [33]:
models = [
    RandomForestClassifier(n_estimators=100, max_depth=5, random_state=0),
    LinearSVC(),
    MultinomialNB(),
    LogisticRegression(random_state=0),
]
# 5 Cross-validation
CV = 5
cv_df = pd.DataFrame(index=range(CV * len(models)))

entries = []
for model in models:
  model_name = model.__class__.__name__
  accuracies = cross_val_score(model, features, labels, scoring='accuracy', cv=CV)
  for fold_idx, accuracy in enumerate(accuracies):
    entries.append((model_name, fold_idx, accuracy))
    
cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])

In [34]:
mean_accuracy = cv_df.groupby('model_name').accuracy.mean()
std_accuracy = cv_df.groupby('model_name').accuracy.std()

acc = pd.concat([mean_accuracy, std_accuracy], axis= 1, 
          ignore_index=True)
acc.columns = ['Mean Accuracy', 'Standard deviation']
acc

Unnamed: 0_level_0,Mean Accuracy,Standard deviation
model_name,Unnamed: 1_level_1,Unnamed: 2_level_1
LinearSVC,1.0,0.0
LogisticRegression,1.0,0.0
MultinomialNB,1.0,0.0
RandomForestClassifier,1.0,0.0


In [35]:
X_train, X_test, y_train, y_test,indices_train,indices_test = train_test_split(features, labels, df.index, test_size=0.25, random_state=1)
model = LinearSVC()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [36]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,random_state = 0)
X_train = np.array(X_train).tolist()
y_train = np.array(y_train).tolist()
X_train = list(map(''.join, X_train))
y_train = list(map(''.join, y_train))
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=1,ngram_range=(1, 2), stop_words='english')
fitted_vectorizer = tfidf.fit(X_train)
tfidf_vectorizer_vectors = fitted_vectorizer.transform(X_train)
model = LinearSVC().fit(tfidf_vectorizer_vectors, y_train)

In [37]:
def output(text):
  final = model.predict(fitted_vectorizer.transform([text]))
  result = ''.join(final)
  return result

In [38]:
text = """What next steps would you like to take from here?"""
final = model.predict(fitted_vectorizer.transform([text]))
result = ''.join(final)
print(result)

Discuss


In [39]:
import pickle
# Save the model
mod_file = 'modelFile.model'
pickle.dump(model , open(mod_file, 'wb'))

In [40]:
vec_file = 'fitted_vectorizer.pickle'
pickle.dump(fitted_vectorizer, open(vec_file, 'wb'))

**Save Model**

In [41]:
# Save the model
mod_file = 'modelFile.model'
pickle.dump(model , open(mod_file, 'wb'))
# Save Function used for prediction
vec_file = 'fitted_vectorizer.pickle'
pickle.dump(fitted_vectorizer, open(vec_file, 'wb'))


In [42]:
# # For Saving Model
# joblib.dump(model,"model.pkl")
# # For Loading Model
# model = joblib.load('/content/model.pkl')

In [43]:
# import pickle
## For Saving Model
# with open('model_pickle', 'wb') as f:
#   pickle.dump(model,f)
## For Loading Model
# with open('model_pickle', 'rb') as f:
#   mp = pickle.load(f)

**How to Used Saved Model Somewhere**

In [44]:
import pickle
# load the model
loaded_model = pickle.load(open('modelFile.model', 'rb'))
# load function
l_model = pickle.load(open('fitted_vectorizer.pickle', 'rb'))

In [45]:
text = """What price were you expecting?

It seems like you might value price over functionality, delivery time and quality."""
final = loaded_model.predict(l_model.transform([text]))
result = ''.join(final)
print(result)

Discuss
