In [21]:
# If you ran task 2 you don't need to re-clone the directory. You only need to mount your drive.

from google.colab import drive
drive.mount('/content/drive') # Mount your drive

# Skip these lines below if you already cloned.
%cd /content/drive/MyDrive/
!git clone https://github.com/yusefmustafa/CS-301-Assignment-3.git # Clone my repo which has the TubeSpam dataset

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive
Cloning into 'CS-301-Assignment-3'...
remote: Enumerating objects: 58, done.[K
remote: Counting objects: 100% (58/58), done.[K
remote: Compressing objects: 100% (40/40), done.[K
remote: Total 58 (delta 15), reused 52 (delta 9), pack-reused 0[K
Unpacking objects: 100% (58/58), done.
Checking out files: 100% (25/25), done.


In [22]:
!pip install lime # Install LIME



In [24]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# Train and evaluate a spam classifier on the TubeSpam dataset:
#   1. load the CSV(s) into one dataframe,
#   2. split into train/test sets,
#   3. build bag-of-words counts and re-weight them with TF-IDF,
#   4. fit logistic regression and report accuracy on the held-out set.

# Feel free to uncomment the other datasets and change directories as needed.
# We included only one dataset as per TA's instructions.
# (The closing bracket is on its own line so entries can be uncommented directly.)
tube_spam_dataset_locations = [
    '/content/drive/MyDrive/CS-301-Assignment-3/TubeSpamDataset/Youtube01-Psy.csv',
    # '/content/drive/MyDrive/CS-301-Assignment-3/TubeSpamDataset/Youtube02-KatyPerry.csv',
    # '/content/drive/MyDrive/CS-301-Assignment-3/TubeSpamDataset/Youtube03-LMFAO.csv',
    # '/content/drive/MyDrive/CS-301-Assignment-3/TubeSpamDataset/Youtube04-Eminem.csv',
    # '/content/drive/MyDrive/CS-301-Assignment-3/TubeSpamDataset/Youtube05-Shakira.csv',
]

# Fixed seed so the train/test split (and therefore the reported accuracy)
# is the same on every run.
RANDOM_SEED = 42

print("Combining all datasets...")
frames = []
for dataset_location in tube_spam_dataset_locations:
  frames.append(pd.read_csv(dataset_location))
  print("Added:  " + dataset_location)

data = pd.concat(frames, axis=0, ignore_index=True) # Collect all data from CSVs into one dataframe

X = list(data['CONTENT'])
y = list(data['CLASS']) # 0 = ham, 1 = spam

# Reserve 30% of the data for testing; the remaining 70% is used for training.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.3, random_state = RANDOM_SEED)

# Create our bag of words using CountVectorizer
print("\nCreating bag-of-words...")
vectorizer = CountVectorizer(analyzer = "word",
                             tokenizer = None,
                             preprocessor = None,
                             stop_words = None,
                             ngram_range = (1, 3),
                             max_features = 1000)

# The vocabulary is learned from the training comments only.
train_data_features = vectorizer.fit_transform(X_train)
train_data_features = train_data_features.toarray()

# get_feature_names() was removed in newer scikit-learn releases in favour of
# get_feature_names_out(); support both so the cell runs on either version.
if hasattr(vectorizer, "get_feature_names_out"):
  feature_names = vectorizer.get_feature_names_out().tolist()
else:
  feature_names = vectorizer.get_feature_names()
print("Found words: " + str(feature_names))

print("Counting word frequency... ")
# We can determine how important a word is to a document with a frequency counter
tfidf = TfidfTransformer()
tfidf_features = tfidf.fit_transform(train_data_features).toarray()

# Train logistic regression model
print("Training logistic regression model... ")
lr = LogisticRegression()
lr.fit(tfidf_features, y_train)

print("Testing model against X_test...")
test_data_features = vectorizer.transform(X_test).toarray()
# Use transform (NOT fit_transform) here: the test set must be weighted with the
# IDF values learned from the training set. Refitting on the test data would
# leak test statistics and feed the model differently scaled features than it
# was trained on.
test_data_tfidf_features = tfidf.transform(test_data_features).toarray()

predicted_y = lr.predict(test_data_tfidf_features)
# Element-wise comparison of the predicted labels against the true labels.
is_y_prediction_correct = predicted_y == y_test

spam_detection_accuracy = np.mean(is_y_prediction_correct) * 100
print ('Accuracy: ' + str(spam_detection_accuracy) + '%')

Combining all datasets...
Added:  /content/drive/MyDrive/CS-301-Assignment-3/TubeSpamDataset/Youtube01-Psy.csv

Creating bag-of-words...
Found words: ['000', '000 000', '000 000 000', '100', '100 subscribers', '100 subscribers will', '127', '1415297812', '2014', '2015', '2billion', '2billion hits', '48051', '5million', '8bit', 'about', 'about to', 'absolutely', 'after', 'again', 'alive', 'all', 'all about', 'all of', 'alone', 'also', 'am', 'am new', 'am new youtuber', 'amazon', 'amp', 'amp theater', 'an', 'and', 'and check', 'and check out', 'and even', 'and get', 'and give', 'and please', 'and please subscribe', 'and the', 'and will', 'and will subscribe', 'any', 'any other', 'anymore', 'anyone', 'appreciate', 'appreciate if', 'apps', 'are', 'as', 'asia', 'asian', 'ass', 'at', 'at salon', 'at the', 'auditioning', 'auditioning for', 'away', 'awesome', 'awesome and', 'back', 'be', 'beautiful', 'because', 'because they', 'before', 'best', 'billion', 'billion views', 'billion views this',