<a href="https://colab.research.google.com/github/wolfzxcv/ml-examples/blob/master/intent_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from pandas import read_csv

# Location of the raw CSV holding the example sentences and their intent labels.
url = 'https://raw.githubusercontent.com/wolfzxcv/ml-examples/master/IC.csv'

# skiprows=1 drops the first line of the file (presumably its header row) and
# header=None keeps pandas' default integer column labels, so the columns are
# addressed purely by position below.
dataset = read_csv(url, header=None, skiprows=1)

# Column 0 holds the sentence text, column 1 the intent used as the training label.
sentences, intentions = (dataset.iloc[:, position] for position in (0, 1))

In [None]:
# Preview the sentence column; pandas abbreviates the middle rows of a long Series.
print(sentences)

0                        How is the weather today
1                              Will it rain today
2                What's the forecast for tomorrow
3                             Is it sunny outside
4                     Do I need an umbrella today
                          ...                    
109        What should I do if I lose my passport
110                  How do I navigate a new city
111       What's the best way to travel with kids
112                        How do I avoid jet lag
113    What are the must-see attractions in Paris
Name: 0, Length: 114, dtype: object


In [None]:
# document https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-n-grams features: word unigrams and bigrams, stored as binary
# presence flags (binary=True) rather than occurrence counts.
# The token pattern keeps a run of ASCII letters together as one token and
# otherwise emits a single word character (e.g. a digit) as its own token.
feature_extractor = CountVectorizer(analyzer="word", ngram_range=(1, 2), binary=True,
            token_pattern=r'([a-zA-Z]+|\w)')

# Learn the vocabulary from the training sentences and encode them as a
# sparse matrix with one row per sentence and one column per n-gram.
X = feature_extractor.fit_transform(sentences)

# Inspect the learned n-gram vocabulary (its size is printed in the next cell).
feature_extractor.get_feature_names_out()

array(['5', '5 kgs', 'a', 'a beginner', 'a budget', 'a chance', 'a day',
       'a fee', 'a flight', 'a great', 'a healthy', 'a heatwave',
       'a hotel', 'a membership', 'a month', 'a new', 'a perfect',
       'a pleasure', 'a trip', 'a visa', 'a workout', 'abs', 'access',
       'access member', 'account', 'activate', 'activate my', 'after',
       'after a', 'afternoon', 'air', 'air quality', 'all', 'all for',
       'an', 'an account', 'an international', 'an umbrella', 'any',
       'any discounts', 'any membership', 'any more', 'any severe',
       'apply', 'apply for', 'apps', 'are', 'are some', 'are the',
       'are there', 'are things', 'are you', 'arms', 'at', 'at the',
       'attractions', 'attractions in', 'avoid', 'avoid jet', 'be',
       'be cloudy', 'be tomorrow', 'be windy', 'beach', 'been',
       'beginner', 'beginner workout', 'belly', 'belly fat', 'benefits',
       'benefits of', 'best', 'best exercise', 'best time', 'best travel',
       'best way', 'book', '

In [None]:
# Number of distinct n-grams (feature columns) learned by the vectorizer.
vocabulary_size = len(feature_extractor.get_feature_names_out())
print(vocabulary_size)

617


In [None]:
from sklearn.linear_model import LogisticRegression

# L2-regularised logistic regression. class_weight='balanced' weights each
# class inversely to its frequency in the training labels.
lr = LogisticRegression(
    class_weight='balanced',
    penalty="l2",
)

# Train on the n-gram features; fit() returns the estimator itself, which is
# therefore also the value this cell displays.
lr.fit(X, intentions)

In [None]:
# Transform new input into the same n-gram feature space
user_input = ['How crazy will it rain tomorrow']
# transform() reuses the vocabulary learned by fit_transform(), so n-grams that
# were not seen during fitting are simply ignored.
X2 = feature_extractor.transform(user_input)
# Check the dimension: the column count of the sparse matrix is the number of
# features, read from .shape without building a dense copy of the row.
X2.shape[1]

617

In [None]:
# Check which features were extracted from this sentence. Each printed entry is
# a (row, column) position in the sparse matrix followed by its stored value.
print(X2)

  (0, 226)	1
  (0, 305)	1
  (0, 311)	1
  (0, 434)	1
  (0, 547)	1
  (0, 597)	1
  (0, 598)	1


In [None]:
# Map the non-zero column indices of the encoded sentence back to the n-grams
# they stand for.
feature_names = feature_extractor.get_feature_names_out()
# nonzero() returns (row_indices, column_indices); only the columns matter here.
_, active_columns = X2.nonzero()
for name in feature_names[active_columns]:
    print(name)

how
it
it rain
rain
tomorrow
will
will it


In [None]:
# Then we can predict the intention: predict() returns the most likely label
# for each row of the encoded input.
lr.predict(X2)

array(['weather'], dtype=object)

In [None]:
# We can also check the probability for each intention.
# predict_proba() returns one row per input sentence; take the only row.
probs = lr.predict_proba(X2)[0]

# Pair every class label with its probability and rank from most to least likely.
ranked_intents = sorted(
    zip(lr.classes_, probs),
    key=lambda pair: pair[1],
    reverse=True,
)
for predict_intent, prob in ranked_intents:
    print(predict_intent, prob)

weather 0.7631856662596612
greeting 0.11833269969774443
fitness 0.03775597049072964
quit 0.03356810387146285
membership 0.026888331660646737
travel 0.02026922801975512
