In [5]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords
from gensim.models.word2vec import Word2Vec
from sklearn.pipeline import Pipeline
from collections import Counter

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, plot_confusion_matrix, f1_score, roc_auc_score, roc_curve
from sklearn.ensemble import RandomForestClassifier


# Problem Statement

Our client, DC Entertainment, is looking to roll out a new advertising campaign to take on its rival, Marvel Studios. Their aim is to increase sales and grow their fanbase. Reddit is a known hangout for movie fans, where they post memes, discussions of movie plots and characters, etc. It is important for DC Entertainment to classify users as Marvel or DC fans to determine the kind of advertisements to display to them.

The goal of our project is to help our client in their advertising campaign by:
- Classifying subreddit posts as Marvel or DC
- Providing recommendations to our client for their advertising campaign by suggesting the most popular words used in the DC subreddit.

Our solution will use a combination of NLP and classification models. We will use NLP tools such as CountVectorizer and TfidfVectorizer, and classification models such as Logistic Regression, Random Forest and Multinomial Naive Bayes, to classify subreddit posts into the right category. Our model will be able to analyse an incoming post and categorise it correctly as Marvel or DC. Success will be measured by the accuracy score of our model.

Stakeholders: 
- Primary: 
    - DC Entertainment 
    
- Secondary: 
    - Internet users of the DC_Cinematic and marvelstudios subreddits



# 1. Data Collection, fetching posts from PushShift API

In [6]:
def get_subreddit(subreddit, count):
    """Fetch up to ``count`` pages of submissions for a subreddit from Pushshift.

    The first request carries no ``before`` filter. Every following request
    passes the ``created_utc`` of the last post received so far as ``before``,
    so each page continues where the previous one ended. Paging stops early
    if the API returns an empty page.

    Parameters
    ----------
    subreddit : str
        Name of the subreddit to query, e.g. ``'marvelstudios'``.
    count : int
        Maximum number of API calls to make (100 posts are requested per call).

    Returns
    -------
    pandas.DataFrame
        One row per submission, one column per field seen in any submission.
        Empty when ``count`` is 0 or the API returns no posts.

    Raises
    ------
    requests.HTTPError
        If the API answers with a 4xx/5xx status code.
    """
    url = 'https://api.pushshift.io/reddit/search/submission'

    params = {
        'subreddit': subreddit,
        'size': 100,
    }

    # Collect plain dicts and build the DataFrame once at the end:
    # DataFrame.append no longer exists in pandas >= 2.0 and growing a frame
    # inside a loop copies all previous rows on every iteration.
    posts = []
    fields = set()  # union of keys seen so far == number of columns in the result

    for _ in range(count):
        if posts:
            # Continue from the last post of the previous page.
            params['before'] = posts[-1]['created_utc']

        res = requests.get(url, params=params, timeout=30)
        # Surface rate-limit / server errors here instead of as a confusing
        # JSON decoding failure on an error body.
        res.raise_for_status()
        page = res.json()['data']

        if not page:
            # Nothing left to fetch; further calls would repeat the same query.
            break

        posts.extend(page)
        for post in page:
            fields.update(post)
        # Progress report in the same (rows, columns) form as DataFrame.shape.
        print((len(posts), len(fields)))

    return pd.DataFrame(posts)

In [28]:
# Download marvelstudios submissions (35 API calls)
df_marvel = get_subreddit(subreddit='marvelstudios', count=35)

(100, 79)
(200, 79)
(300, 81)
(400, 82)
(500, 82)
(600, 83)
(700, 83)
(800, 83)
(900, 83)
(1000, 84)
(1100, 84)
(1200, 84)
(1300, 84)
(1400, 85)
(1500, 86)
(1600, 86)
(1700, 86)
(1800, 86)
(1900, 86)
(2000, 86)
(2100, 86)
(2200, 86)
(2300, 86)
(2400, 86)
(2500, 86)
(2600, 86)
(2700, 86)
(2800, 86)
(2900, 86)
(3000, 86)
(3100, 86)
(3200, 86)
(3300, 86)
(3400, 86)
(3500, 86)


In [34]:
# Persist the fetched marvelstudios posts to disk, leaving out the row index
df_marvel.to_csv(path_or_buf='../data/marvel.csv', index=False)

In [10]:
# Download DC_Cinematic submissions (30 API calls)
df_DC = get_subreddit(subreddit='DC_Cinematic', count=30)

(100, 79)
(200, 80)
(300, 81)
(400, 82)
(500, 82)
(600, 82)
(700, 83)
(800, 83)
(900, 83)
(1000, 83)
(1100, 83)
(1200, 83)
(1300, 83)
(1400, 83)
(1500, 83)
(1600, 83)
(1700, 83)
(1800, 83)
(1900, 83)
(2000, 83)
(2100, 83)
(2200, 83)
(2300, 83)
(2400, 83)
(2500, 83)
(2600, 83)
(2700, 83)
(2800, 83)
(2900, 83)
(3000, 83)


In [35]:
# Persist the fetched DC_Cinematic posts to disk, leaving out the row index
df_DC.to_csv(path_or_buf='../data/DC.csv', index=False)