In [2]:
import pandas as pd

# Tab-separated SMS dataset (sms.tsv); read with header=None, so the two
# column names are supplied here instead of being taken from the file.
url = "https://raw.githubusercontent.com/justmarkham/pycon-2016-tutorial/master/data/sms.tsv"
df = pd.read_csv(
    url,
    sep='\t',
    header=None,
    names=['label', 'message'],
)

# Last expression of the cell: show the first rows as a quick sanity check.
df.head()


Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
# Summarize the frame: number of rows, per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   label    5572 non-null   object
 1   message  5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [34]:
# Encode the target as integers: ham -> 0, spam -> 1.
# A plain .map(label_codes) is not safe to re-run: on the already-encoded
# column every value becomes NaN, because 0 and 1 are not keys of the mapping.
# Values that are not keys are therefore passed through unchanged, which makes
# this cell idempotent.
label_codes = {'ham': 0, 'spam': 1}
df['label'] = df['label'].map(lambda value: label_codes.get(value, value))

# Fail loudly if anything other than ham/spam (or an already-encoded 0/1) is
# present, instead of silently carrying bad labels into training.
assert df['label'].isin([0, 1]).all(), "unexpected label values after encoding"

# Show one complete message; pick the column first, then the row by position,
# rather than materializing a whole row and indexing into it.
df['message'].iloc[12]

'URGENT! You have won a 1 week FREE membership in our £100,000 Prize Jackpot! Txt the word: CLAIM to No: 81010 T&C www.dbuk.net LCCLTD POBOX 4403LDNW1A7RW18'

In [29]:
# Inspect the first 20 rows, including the label column after encoding.
df.head(20)

Unnamed: 0,label,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
5,1,FreeMsg Hey there darling it's been 3 week's n...
6,0,Even my brother is not like to speak with me. ...
7,0,As per your request 'Melle Melle (Oru Minnamin...
8,1,WINNER!! As a valued network customer you have...
9,1,Had your mobile 11 months or more? U R entitle...


In [6]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score


In [7]:
# Features are the raw message text; the target is the label column.
X, y = df['message'], df['label']

In [15]:
# Hold out 20% of the messages for evaluation; the fixed random_state keeps
# the split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [16]:
# Fit the vectorizer on the training messages only, then encode them.
vectorizer = CountVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
# The test messages are only transformed (not fitted), so they are encoded
# with the vectorizer state learned from the training split.
X_test_vec = vectorizer.transform(X_test)

In [17]:
# Logistic regression with default settings, trained on the vectorized
# training messages and their labels.
model = LogisticRegression()
model.fit(X_train_vec, y_train)


In [18]:
# Predict labels for the held-out test messages.
y_pred = model.predict(X_test_vec)

In [19]:
# Overall accuracy first, then the per-class report, both computed on the
# held-out test split.
test_accuracy = accuracy_score(y_test, y_pred)
report_text = classification_report(y_test, y_pred)

print("Accuracy:", test_accuracy)
print("\nClassification Report:\n", report_text)

Accuracy: 0.9883408071748879

Classification Report:
               precision    recall  f1-score   support

           0       0.99      1.00      0.99       966
           1       1.00      0.91      0.95       149

    accuracy                           0.99      1115
   macro avg       0.99      0.96      0.97      1115
weighted avg       0.99      0.99      0.99      1115



In [48]:
# Hand-written sample to score, wrapped in a one-element list because the
# vectorizer is given a collection of texts.
# NOTE(review): this is a long marketing email, while the classifier in this
# notebook was fitted on SMS messages (sms.tsv) -- a different kind of text
# than the training data, so treat the prediction with caution.
test_email = ["""Dear friends,

Since we launched our Website Visitor ID Accuracy Test this month, dozens of brands have stepped up to put their current data providers to the test.

Why?

Because 2025 is shaping up to be a tough year for growing a brand.

Consumer spending is cooling. Meta ROAS is getting squeezed. And the steady disappearance of third-party tracking signals is making scale harder than ever.

Marketers are fighting enough headwinds. Unfortunately, we recently made public another challenge that's negatively impacting brands: the disturbingly low accuracy rate of anonymous website identity resolution providers.

This finding became apparent based on testing with multiple DTC brands, and we will continue to publish these test results and share them with the community.

Take this recent example from Modern Blaze, a leading home goods store on Shopify.

They ran a controlled test, comparing the performance and real business impact of two website visitor identification platforms: Customers.ai and Opensend.

Here’s what they found:

    Opensend’s contacts caused 1.6x higher unsubscribe rates and 2.9x more spam complaints.
    Zero revenue from Opensend-identified contacts. Not one sale.
    In contrast, Customers.ai delivered a 2.5x higher click rate and over $10k in revenue that would have otherwise been lost to cookie deprecation.

Results like these are why we’re on a mission at Customers.ai to raise the bar.

Marketers deserve better than vague promises and 5-30% accuracy. Transparency and accuracy should be the standard — not the exception.

The good news: marketers who demand better will come out stronger on the other side.

If you’d like to test your current data provider and see how they measure up, we’d love to help.
 
Test Your Visitor ID Data

We’re building a movement for better data. And we’re building it with operators like you.

P.S. I invite you to join the industry conversation happening on LinkedIn. Let your voice and perspective be heard.

P.P.S. Our friends at Replo invited me on their podcast to unpack the findings of dozens of website visitor ID data quality tests. If you're interested in understanding what's happening in our space, you can tune in here.

Sincerely,
Larry Kim
CEO & Founder
Customers.ai
"""]
# Encode the sample with the already-fitted vectorizer (transform only; the
# vectorizer is not refitted here).
test_email_features = vectorizer.transform(test_email)

In [49]:
# Classify the encoded sample with the trained model.
predict_spam = model.predict(test_email_features)

In [50]:
# Display the predicted label (per the encoding used in this notebook: 0 = ham, 1 = spam).
predict_spam

array([1])