### Imports

To use the YouTube Data API, save your API key in a file named api_key.txt in the same directory as this code. You can create an API key at https://console.cloud.google.com/. Instructions: https://blog.hubspot.com/website/how-to-get-youtube-api-key

In [19]:
from googleapiclient.discovery import build
import random
import string
import matplotlib.pyplot as plt 
import pandas as pd
import re
import wordninja
import pickle
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.feature_extraction.text import TfidfVectorizer

# Read the YouTube Data API key from api_key.txt. The context manager closes
# the file handle, and strip() removes surrounding whitespace (typically the
# trailing newline an editor appends) so it is not sent as part of the key.
with open("api_key.txt", "r") as f:
    api_key = f.read().strip()
# API client used by get_video_comments() to request comment threads.
youtube = build('youtube', 'v3', developerKey=api_key)

### Getting Data

In [7]:
def get_video_comments(video_id):
    """Return the text of every comment and reply on one video.

    Pages through ``youtube.commentThreads().list`` 100 threads at a time,
    following ``nextPageToken`` until the API stops returning one.

    Args:
        video_id: ID of the YouTube video to read comments from.

    Returns:
        A list of strings: each top-level comment is followed by the replies
        included in its thread item, in the order the API returned them.
    """
    collected = []

    # Arguments shared by the first request and every follow-up page.
    request_args = dict(
        part='snippet, replies',
        videoId=video_id,
        textFormat='plainText',
        maxResults=100,
    )

    page = youtube.commentThreads().list(**request_args).execute()

    while page:
        for thread in page['items']:
            thread_snippet = thread['snippet']
            collected.append(
                thread_snippet['topLevelComment']['snippet']['textDisplay']
            )

            # Only read replies when the thread reports some AND the item
            # actually carries a 'replies' section.
            if thread_snippet['totalReplyCount'] > 0 and 'replies' in thread:
                collected.extend(
                    reply['snippet']['textDisplay']
                    for reply in thread['replies']['comments']
                )

        # No token means this was the last page.
        if 'nextPageToken' not in page:
            break
        page = youtube.commentThreads().list(
            pageToken=page['nextPageToken'],
            **request_args,
        ).execute()

    return collected

def get_comments_from_videos(video_ids_list, max_per_video=200):
    """Sample comments from several videos and write them to comments.csv.

    Args:
        video_ids_list: Iterable of YouTube video IDs to fetch comments for.
        max_per_video: Upper bound on the number of randomly sampled comments
            kept per video; videos with fewer comments contribute all of them.

    Returns:
        None. The sampled comments are written to ``comments.csv`` as a single
        ``comment`` column, without the index.
    """
    sampled = []
    for vid in video_ids_list:
        fetched = get_video_comments(vid)
        # Never ask random.sample for more items than are available.
        keep = min(max_per_video, len(fetched))
        sampled += random.sample(fetched, keep)

    frame = pd.DataFrame(sampled, columns=["comment"])
    frame.to_csv("comments.csv", index=False)

In [8]:
# IDs of the YouTube videos whose comments are collected; the trailing comment
# on each entry names the channel.
videos = [
    'lDKy8S1QTyM', # Graham Stephan
    'Bi2m3egJWGc', # Andrei Jikh
    'YZxeQ7xiOyU', # Nate O'Brien
    'uLgh91wO86Y', # Biaheza
    'Ui5r9wRFL2M', # Tech Lead
    '0y4NkoI4lYw', # Altcoin Daily
]
# Uncomment to fetch the comments for these videos and (re)write comments.csv.
# get_comments_from_videos(videos)

### Clean Comment

First, I went into the CSV file and added a column called "Label". I labelled each scam comment as 1 and normal as 0.

In [None]:
# Load the raw comments collected earlier.
df = pd.read_csv("comments.csv")

# Build the cleaned text in a single pass: drop every character that is not
# an ASCII letter, digit or space, then let wordninja re-segment the result
# into words joined by single spaces.
df["clean_comment"] = df["comment"].apply(
    lambda text: ' '.join(
        wordninja.split(re.sub(r'[^A-Za-z0-9 ]+', '', text))
    )
)

# Keep one row per distinct cleaned comment and persist the dataset.
df = df.drop_duplicates(subset="clean_comment")
df.to_csv("comment_dataset.csv", index=False)

In [26]:
# Load the cleaned dataset from comment_dataset.csv for modelling.
df2 = pd.read_csv("comment_dataset.csv")

In [27]:
# Display the loaded dataset.
df2

Unnamed: 0,Comment,Label,clean_comment
0,F.o.r m.o.r.e g.u.i.d.a.n.c.e\n+.\n1...-3...-8...,1,For more guidance 13857774398 WHAT S APP Thank...
1,•T•h•a•n•k•s f•o•r w*a•t•c•h•I•n•g• •f•o•r c•r...,1,Thanks for watchIng for crypto investment guid...
2,Text on telegram,1,Text on telegram
3,*Thanks for the feedback~ ~for more Info.*\n*•...,1,Thanks for the feedback for more Info WHAT SAP...
4,T.h.a.n.k.s f.o.r w.a.t.c.h.i.n.g \nF.o.r m.o...,1,Thanks for watching For more guidance Write me...
...,...,...,...
383,@TechLead Thank you for your reply! Congrats o...,0,Tech Lead Thank you for your reply Congrats on...
384,*blocks football conference to hide insider tr...,0,blocks football conference to hide insider tra...
385,"but when you die none of it will matter, we al...",0,but when you die none of it will matter we all...
386,If you like government regulation and involvem...,0,If you like government regulation and involvem...


In [28]:
def edit_comment(comment):
    """Return *comment* lower-cased, tolerating missing values.

    A cleaned comment that is empty is stored as a blank CSV cell, which
    pandas reads back as NaN (a float). Calling ``.lower()`` on that raised
    AttributeError, so missing values are mapped to an empty string instead.

    Args:
        comment: The comment text. ``None`` and NaN are treated as missing;
            any other non-string value is converted with ``str()`` first.

    Returns:
        The lower-cased text, or ``""`` when the value is missing.
    """
    # NaN is the only float that is not equal to itself.
    if comment is None or (isinstance(comment, float) and comment != comment):
        return ""
    return str(comment).lower()

In [29]:
# Normalise every cleaned comment with edit_comment (lower-casing) in place.
df2['clean_comment'] = df2['clean_comment'].apply(edit_comment)

### NLP

In [48]:
# Split the cleaned comments and labels, holding out 1% for testing.
# NOTE(review): the confusion matrix printed later in this notebook covers
# only 4 samples, so the evaluation is very coarse; consider a larger
# test_size (and possibly stratifying on Label).
X_train, X_test, y_train, y_test = train_test_split(df2['clean_comment'], df2['Label'], test_size = 0.01, random_state = 1)

# Fit the TF-IDF vocabulary on the training comments only, then replace
# X_train with its vectorised form.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train)

# save vectorizer file; the context manager flushes and closes the handle
with open("vectorizer", 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

In [49]:
# train model on data (support vector classifier with C=1000)
svm_model = svm.SVC(C=1000)
svm_model.fit(X_train, y_train)

# save ML model; the context manager flushes and closes the handle
with open("scam_model", 'wb') as model_file:
    pickle.dump(svm_model, model_file)

In [50]:
# Vectorise the held-out comments with the already-fitted vectorizer, predict
# their labels and print the confusion matrix against the true labels.
# NOTE(review): X_test is rebound to the transformed matrix, so re-running this
# cell without re-running the split presumably fails or misbehaves — confirm.
X_test = vectorizer.transform(X_test)
y_pred = svm_model.predict(X_test)
print(confusion_matrix(y_test, y_pred))

[[1 0]
 [0 3]]


### Classification

In [36]:
def is_scam(comment, *, model=None, vectorizer=None):
    """Classify a raw comment as scam (True) or not (False).

    The comment is cleaned the same way as the training data: characters
    other than ASCII letters, digits and spaces are removed, the text is
    re-segmented with wordninja, and the result is passed through
    ``edit_comment``.

    Args:
        comment: Raw comment text (a string).
        model: Optional fitted classifier. When omitted it is unpickled from
            the ``scam_model`` file on every call.
        vectorizer: Optional fitted vectorizer. When omitted it is unpickled
            from the ``vectorizer`` file on every call.

    Returns:
        True when the model predicts label 1 for the comment, else False.

    Raises:
        TypeError: If *comment* is not a string (raised by ``re.sub``).
    """
    # load model and vectorizer unless the caller supplied them; pass
    # preloaded objects when classifying many comments to avoid re-reading
    # both files for each one.
    # NOTE: pickle can execute arbitrary code while loading, so only load
    # files that were produced locally and are trusted.
    if model is None:
        with open("scam_model", 'rb') as model_file:
            model = pickle.load(model_file)
    if vectorizer is None:
        with open("vectorizer", 'rb') as vectorizer_file:
            vectorizer = pickle.load(vectorizer_file)

    # clean comment
    clean_comment = re.sub(r'[^A-Za-z0-9 ]+', '', comment)
    clean_comment = ' '.join(wordninja.split(clean_comment))
    clean_comment = edit_comment(clean_comment)

    # transform using vectorizer (expects a list of documents)
    comment_transformed = vectorizer.transform([clean_comment])

    # generate prediction; bool() yields a plain Python bool
    pred = model.predict(comment_transformed)
    return bool(pred[0] == 1)

In [45]:
# Classify every raw comment with is_scam, then keep the rows whose prediction
# disagrees with the label.
# NOTE(review): df2 includes the rows the model was trained on, so these are
# mostly training-set errors rather than held-out errors.
df2["pred"] = df2["comment"].apply(is_scam)
incorrect = df2.loc[df2["pred"] != df2["Label"]]

In [46]:
# Display the misclassified rows.
incorrect

Unnamed: 0,Comment,Label,clean_comment,pred
18,T•E•L•E•G•R•A•M\n\n@channel_Rep✓.,1,telegram channel rep,False
54,".+/ 1/ 7/ 6/ 0/ 8/ 3/ 6/ 6/ 5/ 8/ 6\n..,",1,1 7 6 0 8 3 6 6 5 8 6,False
65,I REALLY APPRECIATE YOUR SERVICES HACKERRAID O...,1,i really appreciate your services hacker raid ...,False
67,"@misskerry \rI worked with Alisan P Martin, Sh...",1,miss kerry i worked with al is an p martin she...,False
