In [35]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from joblib import dump

In [36]:
select = "Pattern Category"

df = pd.read_csv('dataset.csv')

df = df[pd.notnull(df['text'])]
df = df[df["Pattern Category"] != "Not Dark Pattern"]
col = ["text", select]
df=df[col]
df

Unnamed: 0,text,Pattern Category
0,FLASH SALE | LIMITED TIME ONLY Shop Now,Urgency
5,In demand,Scarcity
7,Hurry! Only 2 left in stock,Scarcity
8,In Stock only 3 left,Scarcity
11,Hurry! Only 2 left,Scarcity
...,...,...
2345,"No thanks, I like paying full price",Misdirection
2349,ONLY 9 LEFT,Scarcity
2350,"No Thanks, I don't want official software at t...",Misdirection
2353,Only 4 left in stock,Scarcity


In [37]:
df["category_id"] = df[select].factorize()[0]
df

Unnamed: 0,text,Pattern Category,category_id
0,FLASH SALE | LIMITED TIME ONLY Shop Now,Urgency,0
5,In demand,Scarcity,1
7,Hurry! Only 2 left in stock,Scarcity,1
8,In Stock only 3 left,Scarcity,1
11,Hurry! Only 2 left,Scarcity,1
...,...,...,...
2345,"No thanks, I like paying full price",Misdirection,2
2349,ONLY 9 LEFT,Scarcity,1
2350,"No Thanks, I don't want official software at t...",Misdirection,2
2353,Only 4 left in stock,Scarcity,1


In [38]:
category_id_df = df[[select, 'category_id']
                    ].drop_duplicates().sort_values('category_id')
category_to_id = dict(category_id_df.values)
id_to_category = dict(
    category_id_df[['category_id', select]].values)

In [39]:
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2',
                        encoding='latin-1', ngram_range=(1, 2), stop_words='english')

features = tfidf.fit_transform(df["text"]).toarray()
labels = df.category_id

In [71]:
X_train, X_test, y_train, y_test = train_test_split(
    df['text'], df[select], train_size=.275)
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

clf = MultinomialNB().fit(X_train_tfidf, y_train)

y_pred = clf.predict(count_vect.transform(X_test))

metrics.accuracy_score(y_pred, y_test)

0.9146198830409357

In [72]:
dump(clf,'cat_class.joblib')
dump(count_vect,'cat_vector.joblib')

['cat_vector.joblib']