# Order Pizza Classification Dataset

1. Import the Dataset
2. Text Preprocessing (Tokenization, Stopwords, Stemming, Lemmatization, NLTK)
3. Text convert into vectors (Bag of words, TF-IDF)
4. Train Test Split
5. Prediction

# Import the Dataset

In [1]:
#Import Libraries
import pandas as pd
import csv


In [2]:
#Import the Dataset
df = pd.read_csv("Dataset.csv")

In [3]:
df.head(10)

Unnamed: 0,text,intentName
0,dont want the order anymore,CancelOrder
1,dont make my order anymore,CancelOrder
2,cancel the order,CancelOrder
3,cancel that order,CancelOrder
4,i would like to order three hawaiian pizzas wi...,ModifyOrder
5,i would like to order two small chicken pizzas...,ModifyOrder
6,i would like two cheese pizzas with extra anch...,ModifyOrder
7,i would like two cheese pizzas with extra anch...,ModifyOrder
8,i would like two extra large pizzas with ham,ModifyOrder
9,pepperoni pizzas without olives but extra sauce,ModifyOrder


In [4]:
# shuffle the DataFrame rows
df = df.sample(frac=1)

In [5]:
df.head(10)

Unnamed: 0,text,intentName
67,what is my name,
23,i would like three pepperoni pizzas hold the s...,ModifyOrder
14,looks right,Confirmation
11,let me get two small chicken pizzas without ja...,ModifyOrder
40,order one pizza,ModifyOrder
58,i would like a large vegetarian pizza half oni...,ModifyOrder
51,correct,Confirmation
80,i want one extra large pepperoni pizza with ex...,ModifyOrder
48,yes,Confirmation
39,order one mandarin pizza thin crust with extra...,ModifyOrder


In [6]:
df['text'].loc[46]

'order a chicken pizza'

# Data cleaning and preprocessing

In [7]:
import re
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\TharakaG\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [8]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [9]:
corpus = []
for i in range(0, len(df)):
    review = re.sub('[^a-zA-Z0-9]', ' ', df['text'][i])
    review = review.lower()
    review = review.split()
    
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')] 
    review = ' '.join(review)
    corpus.append(review)

In [10]:
corpus

['dont want order anymor',
 'dont make order anymor',
 'cancel order',
 'cancel order',
 'would like order three hawaiian pizza without ham add oliv',
 'would like order two small chicken pizza pepperoni',
 'would like two chees pizza extra anchovi small chicken pizza without ham pineappl',
 'would like two chees pizza extra anchovi small chicken pizza without ham pineappl anchovi',
 'would like two extra larg pizza ham',
 'pepperoni pizza without oliv extra sauc',
 'id like order larg pizza extra pepperoni extra chicken extra chees',
 'let get two small chicken pizza without jalapeno add ham',
 'want two extra larg beef suprem pizza without pepperoni oliv',
 'look good',
 'look right',
 'pretti much',
 'right',
 'confirm',
 'confirm',
 'awesom',
 'hello',
 'would like medium pizza extra mushroom light sausag remov chees',
 'would like pepperoni pizza extra chees side ranch',
 'would like three pepperoni pizza hold sauc',
 'would like order four chees pizza extra chees chicken pizza wi

# Creating the Bag of Words model

In [11]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=2500, binary = True, ngram_range = (2,2))
X = cv.fit_transform(corpus).toarray()

In [12]:
X[1]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0], dtype=int64)

In [13]:
y=pd.get_dummies(df['intentName'])
y=y.iloc[:,1].values

In [14]:
y

array([0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0], dtype=uint8)

# Train Test Split

In [15]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [16]:
X_train, y_train

(array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64),
 array([0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0],
       dtype=uint8))

In [17]:
from sklearn.naive_bayes import MultinomialNB
spam_detect_model = MultinomialNB().fit(X_train, y_train)

# Prediction

In [18]:
y_pred=spam_detect_model.predict(X_test)

In [19]:
from sklearn.metrics import accuracy_score,classification_report

In [20]:
score=accuracy_score(y_test,y_pred)
print(score)

0.6470588235294118


In [21]:
from sklearn.metrics import classification_report
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

           0       0.73      0.85      0.79        13
           1       0.00      0.00      0.00         4

    accuracy                           0.65        17
   macro avg       0.37      0.42      0.39        17
weighted avg       0.56      0.65      0.60        17



# Creating the TFIDF model

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer
tv = TfidfVectorizer(max_features=2500, ngram_range = (1,2))
X = tv.fit_transform(corpus).toarray()

# Train Test Split

In [23]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [24]:
from sklearn.naive_bayes import MultinomialNB
spam_detect_model = MultinomialNB().fit(X_train, y_train)

# prediction

In [25]:
y_pred=spam_detect_model.predict(X_test)

In [26]:
score=accuracy_score(y_test,y_pred)
print(score)

0.8823529411764706


In [27]:
from sklearn.metrics import classification_report
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

           0       1.00      0.88      0.94        17
           1       0.00      0.00      0.00         0

    accuracy                           0.88        17
   macro avg       0.50      0.44      0.47        17
weighted avg       1.00      0.88      0.94        17



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
