In [36]:
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.svm import LinearSVC

In [37]:
import pandas as pd

In [38]:
cb=pd.read_csv("combined.csv")
labels=cb["label"].unique()
for label in labels:
    print(f'{label}: Total {len(cb.loc[cb["label"] == label])}')

cb = cb.sample(frac=1, random_state=42).reset_index(drop=True)

Social Proof: Total 625
Misdirection: Total 427
Urgency: Total 438
Forced Action: Total 6
Obstruction: Total 57
Sneaking: Total 24
Scarcity: Total 1095
Not Dark Pattern: Total 1178


In [39]:
# Cleaning data
import re

def clean_text(text):
    text=text.lower()
    text=re.sub("[^a-zA-Z0-9$ ]",'',text)
    return text

cb["text"]=cb.apply(lambda row: clean_text(row["text"]), axis=1)

In [40]:
cb

Unnamed: 0,text,label
0,limited time offer,Urgency
1,no thank you i prefer to pay full price,Misdirection
2,your shopping cart is empty,Not Dark Pattern
3,i want to pay more later,Misdirection
4,qwill this damage my hair,Not Dark Pattern
...,...,...
3845,hurry only 6 left in stock,Scarcity
3846,only 3 left,Scarcity
3847,only 2 units left in stock,Scarcity
3848,1446 people have viewed this item,Social Proof


In [41]:
# Remove stop words
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
 
def remove_stop_words(text):
    text = word_tokenize(text.lower())
    stop_words = set(stopwords.words('english'))
    text = [word for word in text if word not in stop_words]
    text = ["price" if "$" in word else word for word in text ]
    text = ["quantity" if word.isdigit() else word for word in text ]
    text = [word for word in text if word.isalpha()]
    return ' '.join(text)
     
cb["text"] = cb.apply(lambda row: remove_stop_words(row['text']), axis=1)

In [42]:
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()
def lemmatize(text):
    words=text.split(" ")
    new_string=[]
    for word in words:
        new_string.append(wnl.lemmatize(word))
    return " ".join(new_string)


In [43]:
cb["text"] = cb.apply(lambda row: lemmatize(row['text']), axis=1)

In [44]:
# cb.to_csv("combined_cleaned.csv")
cb

Unnamed: 0,text,label
0,limited time offer,Urgency
1,thank prefer pay full price,Misdirection
2,shopping cart empty,Not Dark Pattern
3,want pay later,Misdirection
4,qwill damage hair,Not Dark Pattern
...,...,...
3845,hurry quantity left stock,Scarcity
3846,quantity left,Scarcity
3847,quantity unit left stock,Scarcity
3848,quantity people viewed item,Social Proof


In [45]:
# Converting text to vectors
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(cb["text"])
tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)

X_train_tf

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 18373 stored elements and shape (3850, 3176)>

In [46]:
# mappings={x:i for (i,x) in enumerate(cb["label"].unique())}
# reverse_mapping={i:x for (i,x) in enumerate(cb["label"].unique())}
# mappings
mappings={'Urgency': 0,
 'Misdirection': 1,
 'Not Dark Pattern': 2,
 'Scarcity': 3,
 'Obstruction': 4,
 'Social Proof': 5,
 'Sneaking': 6,
 'Forced Action': 7}
reverse_mapping={x:i for (i,x) in mappings.items()}
reverse_mapping

{0: 'Urgency',
 1: 'Misdirection',
 2: 'Not Dark Pattern',
 3: 'Scarcity',
 4: 'Obstruction',
 5: 'Social Proof',
 6: 'Sneaking',
 7: 'Forced Action'}

In [47]:
y_train=cb.apply(lambda row: mappings[row["label"]], axis=1)
y_train

0       0
1       1
2       2
3       1
4       2
       ..
3845    3
3846    3
3847    3
3848    5
3849    2
Length: 3850, dtype: int64

In [48]:
clf = LinearSVC(dual="auto") #MultinomialNB(force_alpha=True)
train=3500
test=3800
sample=3600
clf.fit(X_train_tf[0:train],y_train[0:train])

In [49]:
predicted = clf.predict(X_train_tf[train+1:test])
clf.predict(X_train_tf[sample])

array([2], dtype=int64)

In [50]:
from sklearn.metrics import accuracy_score
acc=accuracy_score(y_true=y_train[train+1:test], y_pred=predicted)
print(acc)

0.9866220735785953


In [51]:
cb["text"][train+1:test], cb["label"][train+1:test]

(3501         sign newsletter receive news offer promotion
 3502    quantity year subscription auto renews fee rev...
 3503                 baby cross stitch embroidery pattern
 3504                          thanks dont like free stuff
 3505                                      might also like
                               ...                        
 3795                                                 name
 3796                                        quantity left
 3797                               vip offer end quantity
 3798                                 thanks id rather pay
 3799                                    work buyer seller
 Name: text, Length: 299, dtype: object,
 3501    Not Dark Pattern
 3502            Sneaking
 3503    Not Dark Pattern
 3504        Misdirection
 3505    Not Dark Pattern
               ...       
 3795    Not Dark Pattern
 3796            Scarcity
 3797             Urgency
 3798        Misdirection
 3799    Not Dark Pattern
 Name: label, Length: 29

In [52]:
cb["text"][sample]

'womens summer new clerk large size loose cotton linen blouse'

In [53]:
predicted = clf.predict(X_train_tf[sample])

In [54]:
predicted

array([2], dtype=int64)

In [55]:
reverse_mapping[predicted[0]]

'Not Dark Pattern'

In [56]:
# Save model
import joblib
joblib.dump(clf,f"Partial_Lemmatize_test{acc:.2f}_SVC.joblib",2)

['Partial_Lemmatize_test0.99_SVC.joblib']

In [57]:
joblib.dump(count_vect, "countvect_without_lemma.joblib",2)

['countvect_without_lemma.joblib']

In [58]:
from sklearn.pipeline import Pipeline
# text_clf = Pipeline([
#     ('vect', CountVectorizer()),
#     ('tfidf', TfidfTransformer()),
#     ('clf', MultinomialNB()),
# ])

text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('svc', LinearSVC()),
])

text_clf.fit(cb["text"], y_train)
predicted = text_clf.predict(cb["text"][train+1:test])
print(predicted)
acc=accuracy_score(y_true=y_train[train+1:test], y_pred=predicted)
joblib.dump(text_clf, f"LemmatiseTest_SVCpipe{acc}.joblib", 2)
print(acc)


[2 6 2 1 2 1 2 3 2 5 3 1 0 2 2 5 5 1 2 1 2 5 1 2 4 0 0 5 2 5 1 0 5 5 3 1 5
 2 3 3 5 0 2 2 6 3 1 2 1 5 3 1 0 5 2 2 0 5 2 5 5 0 1 5 1 5 3 3 2 1 0 3 2 2
 2 3 2 3 5 3 5 3 3 2 4 5 2 3 2 2 2 2 3 2 2 3 2 5 5 2 3 2 2 2 2 2 6 3 3 2 1
 4 1 1 3 3 3 2 3 2 3 3 3 3 3 2 1 2 2 2 5 5 3 2 2 3 0 3 3 5 1 2 2 0 2 2 2 5
 2 3 2 3 0 5 5 2 2 0 0 0 3 5 3 0 0 2 3 2 1 5 2 1 3 5 1 2 3 1 3 2 2 3 2 1 2
 3 2 2 2 1 1 3 0 3 2 0 2 1 3 0 5 5 2 2 2 0 1 2 3 1 2 1 3 1 2 5 3 2 2 3 2 3
 3 2 5 6 3 2 0 1 1 0 2 0 3 3 1 2 5 3 3 3 1 2 3 2 3 2 5 5 1 0 2 3 2 3 2 3 3
 5 2 2 2 2 2 5 0 2 2 1 2 3 2 3 1 3 2 5 2 5 2 2 0 2 3 3 5 1 4 2 5 5 2 3 2 3
 0 1 2]
1.0


In [59]:
text_clf.predict([cb["text"][sample]])

array([2], dtype=int64)