In [1]:
import pandas as pd

In [2]:
spam = pd.read_csv("vectorizedSpam.csv")
spam.drop(["Unnamed: 0"], inplace=True, axis=1)

In [3]:
# Get a list of stop words from https://gist.github.com/sebleier/554280
with open('stopwords.txt') as file:
    lines = [line.rstrip() for line in file]

def getTopSpam(df, commonWords, num):
  '''
  Returns the most common 'num' words in the spam messages.
  '''
  spam = {}
  ham = []

  # Gets count of all words in spam that are not in the list of common words
  for idx in range(len(df.index)):
    message = df.iat[idx, 1]
    words = message.split()
    for word in words:
      if word not in commonWords and df.iat[idx, 0] == "spam":
        if word not in spam:
          spam[word] = 1
        elif word in spam:
          spam[word] = spam[word] + 1
   
  spam = sorted(spam.items(), key=lambda x:x[1], reverse=True)
  return dict(spam[:num])

result = getTopSpam(spam, lines, 5)
print(result)

{'call': 342, 'free': 180, '2': 169, 'ur': 144, 'txt': 136}


In [4]:
def testTrainSplit(df):
  length = len(spam.index)
  split_limit = int(length * 0.7)
  train = spam[0:split_limit]
  test = spam[split_limit:length]
  return train, test

train, test = testTrainSplit(spam)

In [5]:
def makeVectorTable(df):
  '''
  This function displays the message vectors like the stolen car data table seen
  in our first lecture on Naive Bayes.
  '''
  feature_records = []
  for idx in range(len(df)):
    feature_vector = []
    msg = df.iat[idx, 1]
    label = df.iat[idx, 0]
    for word in result.keys():
      feature_vector.append(1 if word in msg else 0)
    feature_vector.append(label)
    feature_records.append(feature_vector)
  feature_df = pd.DataFrame(feature_records)
  columns = list(result.keys())
  columns.append('label')
  feature_df.columns = columns
  return feature_df
  
feature_df = makeVectorTable(train)
feature_df.head()

Unnamed: 0,call,free,2,ur,txt,label
0,0,0,0,1,0,ham
1,0,0,0,0,0,ham
2,0,1,1,0,1,spam
3,0,0,0,0,0,ham
4,0,0,0,0,0,ham


In [6]:
def getProbHamSpam(df, feature_df):
  '''
  Gets the overall probability of ham and spam labels in dataframe
  '''
  p_spam = feature_df[feature_df.label == 'spam'].label.count() / feature_df.shape[0]
  p_ham = feature_df[feature_df.label == 'ham'].label.count() / feature_df.shape[0]
  return (p_spam, p_ham)

pSpam, pHam = getProbHamSpam(spam, feature_df)
print(pSpam, pHam)

0.13307692307692306 0.8669230769230769


In [7]:
def getSpamWordProbs(feature_df):
  ''' 
  Returns a dictionary of probabilities of 'top' words in messages labeled as spam.
  '''
  spam_df = feature_df[feature_df.label == 'spam']
  spam_word_counts = spam_df[result.keys()].sum(axis=0)
  spam_probs = spam_word_counts / spam_df.shape[0]
  spam_probs = spam_probs.to_dict()
  return spam_probs

getSpamWordProbs(feature_df)

{'call': 0.44315992292870904,
 'free': 0.2678227360308285,
 '2': 0.7398843930635838,
 'ur': 0.5953757225433526,
 'txt': 0.26396917148362237}

In [8]:
def getHamWordProbs(feature_df):
  ''' 
  Returns a dictionary of probabilities of 'top' words in messages labeled as spam.
  '''
  spam_df = feature_df[feature_df.label == 'ham']
  spam_word_counts = spam_df[result.keys()].sum(axis=0)
  spam_probs = spam_word_counts / spam_df.shape[0]
  spam_probs = spam_probs.to_dict()
  return spam_probs

getHamWordProbs(feature_df)

{'call': 0.06033717834960071,
 'free': 0.013309671694764862,
 '2': 0.06595681750961255,
 'ur': 0.21177166518781426,
 'txt': 0.0026619343389529724}

In [10]:
feature_df

Unnamed: 0,Label,Message,Vectors
3900,ham,that depends. how would you like to be treated...,"[0, 0, 0, 0, 0]"
3901,ham,"right on brah, see you later","[0, 0, 0, 0, 0]"
3902,ham,waiting in e car 4 my mum lor. u leh? reach ho...,"[0, 0, 0, 0, 0]"
3903,spam,your 2004 account for 07xxxxxxxxx shows 786 un...,"[1, 0, 0, 0, 0]"
3904,spam,do you want a new video handset? 750 anytime a...,"[1, 0, 0, 0, 0]"
...,...,...,...
5567,spam,this is the 2nd time we have tried 2 contact u...,"[1, 0, 1, 0, 0]"
5568,ham,will �_ b going to esplanade fr home?,"[0, 0, 0, 0, 0]"
5569,ham,"pity, * was in mood for that. so...any other s...","[0, 0, 0, 0, 0]"
5570,ham,the guy did some bitching but i acted like i'd...,"[0, 1, 0, 0, 0]"
