In [79]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB

%matplotlib inline

## Aliens Posts

In [80]:
# Load the 'aliens' subreddit posts from CSV (path is relative to the notebook's working directory).
aliens_posts = pd.read_csv('./datasets/aliens_posts.csv')

In [81]:
# Peek at the first rows; many posts are title-only, so selftext is frequently missing.
aliens_posts.head(3)

Unnamed: 0,subreddit,selftext,title
0,aliens,,A man records the moment when an elongated UFO...
1,aliens,,How many people saw Dr. Greer's appearance on ...
2,aliens,,"Los mismos OVNIs del Pentágono, son noticia ot..."


## Space Posts

In [82]:
# Load the 'space' subreddit posts from CSV (same layout as the aliens posts file).
space_posts = pd.read_csv('./datasets/space_posts.csv')

In [83]:
# Peek at the first rows; selftext may be missing or hold a '[removed]' placeholder.
space_posts.head(3)

Unnamed: 0,subreddit,selftext,title
0,space,The title might be misleading but I'm not sure...,"Speed in space, how do you determine at what s..."
1,space,,https://youtube.com/user/dralansari2010
2,space,[removed],"Space Telemedicine, Telesurgery"


### Aliens Comments

In [84]:
# Load the 'aliens' subreddit comments from CSV (columns: subreddit, body).
aliens_comments = pd.read_csv('./datasets/aliens_comments.csv')

In [85]:
# Peek at the first few comment bodies.
aliens_comments.head(3)

Unnamed: 0,subreddit,body
0,aliens,Yeah an alien from the planet MS-13.
1,aliens,I like drones. I recently went on ebay to buy ...
2,aliens,I don't get it.


### Space Comments

In [86]:
# Load the 'space' subreddit comments from CSV (columns: subreddit, body).
space_comments = pd.read_csv('./datasets/space_comments.csv')

In [87]:
# Peek at the first few comment bodies; note that '[removed]' placeholders occur here too.
space_comments.head(3)

Unnamed: 0,subreddit,body
0,space,I don't think they even had tutorials when I s...
1,space,It's High Roller by Chrystal Method. Whenever ...
2,space,[removed]


### Check for missing values

##### Aliens

In [88]:
# Count missing values per column to see which fields need cleaning.
aliens_posts.isna().sum()

subreddit        0
selftext     15569
title            0
dtype: int64

In [89]:
# Keep only complete rows. dropna() defaults to axis=0 / how='any', i.e. a row
# is removed as soon as any of its columns is missing (here: selftext).
aliens_posts = aliens_posts.dropna()

In [90]:
# Rows/columns remaining after dropping posts with missing values.
aliens_posts.shape

(13816, 3)

In [91]:
# Confirm the surviving rows all carry selftext (the original index labels are kept, hence the gaps).
aliens_posts.head(3)

Unnamed: 0,subreddit,selftext,title
4,aliens,https://youtu.be/LK-AGLpjcCA,"REAL MEN IN BLACK SIGHTINGS, HUMAN OR ALIENS"
9,aliens,Recently Pentagon officially shared the leaked...,SERIOUSLY
14,aliens,This happened over the summer as I was coming ...,My Morphing UFO Encounter


##### Space

In [92]:
# Keep only complete rows (dropna() defaults: axis=0, how='any'), mirroring
# the cleaning applied to the aliens posts.
space_posts = space_posts.dropna()

In [93]:
# Rows/columns remaining after dropping posts with missing values.
space_posts.shape

(6815, 3)

##### Comments

In [94]:
# Drop any comment row that has a missing value (dropna() defaults: axis=0, how='any').
aliens_comments = aliens_comments.dropna()

In [95]:
# Row/column count after dropping missing values.
aliens_comments.shape

(30000, 2)

In [96]:
# Drop any comment row that has a missing value (dropna() defaults: axis=0, how='any').
space_comments = space_comments.dropna()

In [97]:
# Row/column count after dropping missing values.
space_comments.shape

(30000, 2)

In [98]:
# Stack both subreddits' posts into a single frame with a fresh 0..n-1 index.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0; the resulting frame is the same.
# NOTE(review): placeholder text such as '[removed]' (visible in
# space_posts.head) is not NaN, so it survives dropna — consider filtering
# such rows out before modelling.
posts = pd.concat([aliens_posts, space_posts], ignore_index=True)

In [99]:
# Sanity check: the combined frame should contain exactly the two subreddit labels.
posts['subreddit'].unique()

array(['aliens', 'space'], dtype=object)

#### Explore and clean data

In [100]:
# Features: the post body only, kept as a one-column DataFrame.
# Target: the subreddit each post came from.
X, y = posts[['selftext']], posts['subreddit']

In [101]:
# Quick look at the raw text that will be vectorized.
X.head(3)

Unnamed: 0,selftext
0,https://youtu.be/LK-AGLpjcCA
1,Recently Pentagon officially shared the leaked...
2,This happened over the summer as I was coming ...


In [102]:
# Stratify on the label so train and test keep the same aliens/space ratio.
# No test_size is given, so scikit-learn holds out its default 25% of the rows;
# the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
)

In [103]:
# CountVectorizer turns raw text into token-count features.
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words capped at the 500 most frequent terms, with English stop words removed.
cvec = CountVectorizer(stop_words='english', max_features=500)

In [104]:
# Number of training documents that will be fed to the vectorizer.
X_train['selftext'].shape

(15473,)

In [105]:
# Preview of the document-term counts as a dense matrix. This fits the
# vectorizer on the training text; the following cell fits it again when
# building the DataFrame.
train_counts_preview = cvec.fit_transform(X_train['selftext'])
train_counts_preview.todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [106]:
# Fit CountVectorizer on the training data and transform training data.
train_counts = cvec.fit_transform(X_train['selftext'])

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on older installs so the
# cell runs on either side of that change. Must be called after fitting.
feature_names = (
    cvec.get_feature_names_out()
    if hasattr(cvec, 'get_feature_names_out')
    else cvec.get_feature_names()
)

# toarray() yields a plain ndarray (todense() returns the discouraged
# np.matrix type); the resulting DataFrame is the same either way.
X_train_cvec = pd.DataFrame(train_counts.toarray(), columns=feature_names)

In [107]:
# Transform testing data with the already-fit CountVectorizer (no refit, so
# the vocabulary comes from the training text only).
# Column names are taken from the training frame rather than from
# cvec.get_feature_names(), which was removed in scikit-learn 1.2; this also
# guarantees train and test share exactly the same column schema.
X_test_cvec = pd.DataFrame(
    cvec.transform(X_test['selftext']).toarray(),
    columns=X_train_cvec.columns,
)

In [108]:
# Inspect the token-count features: one column per vocabulary term, one row per training post.
X_train_cvec.head()

Unnamed: 0,000,00100000,01100101,10,100,12,15,20,2020,2021,...,world,wouldn,wrong,www,x200b,year,years,yes,youtu,youtube
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1


Fit Naive Bayes model

In [109]:
from sklearn.naive_bayes import MultinomialNB

In [110]:
# Multinomial Naive Bayes with default hyperparameters; a natural fit for token-count features.
nb = MultinomialNB()

In [111]:
# Train on the count features. fit() returns the estimator itself, so `model` and `nb` are the same object.
model = nb.fit(X_train_cvec, y_train)

In [112]:
# Predicted subreddit label for every held-out post.
predictions = model.predict(X_test_cvec)

In [113]:
# Confirm the feature matrix is (training documents, 500 vocabulary terms).
X_train_cvec.shape

(15473, 500)

In [114]:
# Confirm there is exactly one label per training row.
y_train.shape

(15473,)

In [115]:
# Mean accuracy on the training data.
model.score(X_train_cvec, y_train)

0.8087636528145803

In [116]:
# Mean accuracy on the held-out test data. For context, always predicting
# 'aliens' would score about 0.67 (3454 of the 5158 test rows are 'aliens',
# per the confusion matrix below), so that is the baseline to beat.
model.score(X_test_cvec, y_test)

0.8008918185343157

In [117]:
from sklearn.metrics import confusion_matrix

In [118]:
# Rows are true labels, columns are predicted labels. Labels are ordered by
# sorted value, so index 0 is 'aliens' and index 1 is 'space'.
confusion_matrix(y_test, predictions)

array([[3121,  333],
       [ 694, 1010]], dtype=int64)

In [121]:
# Unpack the 2x2 confusion matrix into its four cells.
# Passing labels explicitly pins the row/column order so 'aliens' is the
# negative class and 'space' the positive class, instead of relying on the
# implicit sorted-label order. It also guarantees a 2x2 result (and therefore
# a valid 4-way unpack) even if one class were absent from y_test.
tn, fp, fn, tp = confusion_matrix(
    y_test, predictions, labels=['aliens', 'space']
).ravel()

In [122]:
# Report each confusion-matrix cell on its own line
# ('space' is the positive class, 'aliens' the negative class).
print(
    f"True Negatives: {tn}",
    f"False Positives: {fp}",
    f"False Negatives: {fn}",
    f"True Positives: {tp}",
    sep="\n",
)

True Negatives: 3121
False Positives: 333
False Negatives: 694
True Positives: 1010
