# Continued from proj3_part1

## 1. Import relevant libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

%matplotlib inline

## 2. Read data from Aliens Posts

In [2]:
aliens_posts = pd.read_csv('./datasets/aliens_posts.csv')

In [3]:
aliens_posts.head(3)

Unnamed: 0,subreddit,selftext,title
0,aliens,,A man records the moment when an elongated UFO...
1,aliens,,How many people saw Dr. Greer's appearance on ...
2,aliens,,"Los mismos OVNIs del Pentágono, son noticia ot..."


## 3. Read data from Space Posts

In [4]:
space_posts = pd.read_csv('./datasets/space_posts.csv')

In [5]:
space_posts.head(3)

Unnamed: 0,subreddit,selftext,title
0,space,,Long March 5B rolled out for crewed spacecraft...
1,space,,The Pentagon Officially Releases Images of 'UF...
2,space,Assuming an Earth-like night sky on a supposed...,"Question: at night on Mars, is it possible to ..."


## 4. Read data from Aliens Comments

In [6]:
aliens_comments = pd.read_csv('./datasets/aliens_comments.csv')

In [7]:
aliens_comments.head(3)

Unnamed: 0,subreddit,body
0,aliens,Yeah an alien from the planet MS-13.
1,aliens,I like drones. I recently went on ebay to buy ...
2,aliens,I don't get it.


## 5. Read data from Space Comments

In [8]:
space_comments = pd.read_csv('./datasets/space_comments.csv')

In [9]:
space_comments.head(3)

Unnamed: 0,subreddit,body
0,space,[removed]
1,space,[removed]
2,space,[removed]


## 6. Check for missing values

### 6a. Data cleaning for reddit posts

##### Aliens

In [10]:
aliens_posts.isna().sum()

subreddit        0
selftext     15569
title            0
dtype: int64

In [11]:
aliens_posts = aliens_posts.dropna(axis=0, how='any')

In [12]:
aliens_posts.shape

(13816, 3)

In [13]:
aliens_posts.head(3)

Unnamed: 0,subreddit,selftext,title
4,aliens,https://youtu.be/LK-AGLpjcCA,"REAL MEN IN BLACK SIGHTINGS, HUMAN OR ALIENS"
9,aliens,Recently Pentagon officially shared the leaked...,SERIOUSLY
14,aliens,This happened over the summer as I was coming ...,My Morphing UFO Encounter


##### Space

In [14]:
space_posts = space_posts.dropna(axis=0, how='any')

In [15]:
space_posts.shape

(15095, 3)

##### Comments

In [16]:
aliens_comments = aliens_comments.dropna(axis=0, how='any')

In [17]:
aliens_comments.shape

(99999, 2)

In [18]:
space_comments = space_comments.dropna(axis=0, how='any')

In [19]:
space_comments.shape

(100000, 2)

In [20]:
# posts = aliens_posts.append(space_posts, ignore_index=True)

In [21]:
# posts = aliens_posts.append(space_posts, ignore_index=True)
# posts['subreddit'].unique()

### 6b. Cleaning data for lingos and links, other non-relevant texts

### Aliens

In [22]:
aliens_posts

Unnamed: 0,subreddit,selftext,title
4,aliens,https://youtu.be/LK-AGLpjcCA,"REAL MEN IN BLACK SIGHTINGS, HUMAN OR ALIENS"
9,aliens,Recently Pentagon officially shared the leaked...,SERIOUSLY
14,aliens,This happened over the summer as I was coming ...,My Morphing UFO Encounter
16,aliens,[removed],Alien Visitors seen after ringing found Brass ...
18,aliens,[removed],The Pentagon disclosure might not be what we t...
...,...,...,...
29377,aliens,&amp;#x200B;\n\nhttps://preview.redd.it/e8uqf9...,Aliens are ready !
29378,aliens,"OK, mods, with all due respect, I understand h...",An invite for alien AMA.
29381,aliens,I can't find any mentions of UFO information t...,Alien IT?
29382,aliens,A bunch of clickbait articles have claimed tha...,Is the UFO conference going to happen tomorrow...


In [23]:
aliens_posts = aliens_posts[~aliens_posts['selftext'].str.contains("removed")]

In [24]:
aliens_posts = aliens_posts[~aliens_posts['selftext'].str.contains("youtu.be")]

In [25]:
aliens_posts = aliens_posts[~aliens_posts['selftext'].str.contains("deleted")]

In [26]:
aliens_posts = aliens_posts[~aliens_posts['selftext'].str.contains("preview.redd.it")]

In [27]:
aliens_posts.shape

(9470, 3)

In [28]:
aliens_posts.head(10)

Unnamed: 0,subreddit,selftext,title
9,aliens,Recently Pentagon officially shared the leaked...,SERIOUSLY
14,aliens,This happened over the summer as I was coming ...,My Morphing UFO Encounter
19,aliens,So back in 2011 i was working at a grocery sto...,Title is strange things.
22,aliens,Does anyone remember the allegedly leaked NASA...,Does anyone remember the NASA footage
26,aliens,It was a few weeks ago. It was maybe 2:30 AM. ...,Finally saw a UFO
27,aliens,The most interesting thought along the topic t...,A point was just brought up. How come nobody c...
28,aliens,You hear it right. What if we find out we had ...,How much different would it have been if we fo...
29,aliens,Fake alien invasion?,Why tf do they keep calling them threats and s...
36,aliens,My friend sent me some amazing footage with re...,amazing footage!
38,aliens,"Ok, so I had this idea when I saw someone had ...",Did the Greys influence anime culture?


### Space

In [29]:
space_posts.head(10)

Unnamed: 0,subreddit,selftext,title
2,space,Assuming an Earth-like night sky on a supposed...,"Question: at night on Mars, is it possible to ..."
4,space,"Hi, my name is João, I'm 14 years and I'm from...",I'm a 14 year old boy who aspires to be an ast...
6,space,[removed],Hey does anyone know where and when Samantha C...
8,space,[removed],"We are masters of confinement, Ask Us Anything!"
9,space,[deleted],"This little point here is the Earth, look how ..."
10,space,Destined to become the first aircraft to attem...,NASA's Mars Helicopter officially has received...
11,space,[removed],"REAL MEN IN BLACK SIGHTINGS, ARE THEY HUMAN OR..."
17,space,https://preview.redd.it/7mxd9xmderv41.jpg?widt...,First ever picture taken from space
28,space,"Possibly with a bar, where strangers could mee...",Space Force - Will there be Space Station 9 ?
31,space,In the light of the Pentagon declaring the leg...,A discussion on how gravity drives in UFOs cou...


In [30]:
space_posts = space_posts[~space_posts['selftext'].str.contains("removed")]

In [31]:
space_posts = space_posts[~space_posts['selftext'].str.contains("youtu.be")]

In [32]:
space_posts = space_posts[~space_posts['selftext'].str.contains("deleted")]

In [33]:
space_posts = space_posts[~space_posts['selftext'].str.contains("preview.redd.it")]

In [34]:
space_posts.shape

(9459, 3)

In [35]:
space_posts.head(10)

Unnamed: 0,subreddit,selftext,title
2,space,Assuming an Earth-like night sky on a supposed...,"Question: at night on Mars, is it possible to ..."
4,space,"Hi, my name is João, I'm 14 years and I'm from...",I'm a 14 year old boy who aspires to be an ast...
10,space,Destined to become the first aircraft to attem...,NASA's Mars Helicopter officially has received...
28,space,"Possibly with a bar, where strangers could mee...",Space Force - Will there be Space Station 9 ?
31,space,In the light of the Pentagon declaring the leg...,A discussion on how gravity drives in UFOs cou...
33,space,As we move into the age of low cost reusable l...,A discussion on the potential of exploring fly...
38,space,"Maybe a dumb question, but this keeps me up at...",If the suns’ gravity pulls the planets in rota...
43,space,I’m back it it with space stuff I need to wrig...,My theories
48,space,It is going to be above me in 20min and I wond...,As the Hubble satellite visible from earth?
51,space,Hi all! \nDon't know if this is the right plac...,What is the best Beginners telescope


### Append datasets

In [36]:
posts = aliens_posts.append(space_posts, ignore_index=True)
posts['subreddit'].unique()

array(['aliens', 'space'], dtype=object)

## 7. Data exploration

In [37]:
import gensim
import gensim.downloader as api
from gensim.models.word2vec import Word2Vec
from transformers import pipeline

#### Writing a function to look at posts

In [38]:
def print_plot(index):
    example = posts[posts.index == index][['selftext', 'subreddit']].values[0]
    if len(example) > 0:
        print(example[0])
        print('Tag:', example[1])
print_plot(10)

It's brand new, and I've just been populating it with interesting videos that I've seen recently. I'd love for it to become a place for serious discussion and theories as well. 

If this interests any of you please stop by and think about participating!

r/xenogenesis
Tag: aliens


#### Model setup for Naive Bayers

In [39]:
X = posts[['selftext']]
y = posts['subreddit']

In [40]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, stratify=y)

In [41]:
# Import CountVectorizer.
from sklearn.feature_extraction.text import CountVectorizer

# Instantiate CountVectorizer.
cvec = CountVectorizer(max_features = 100000, stop_words = 'english')

In [42]:
X_train = X_train['selftext']

In [43]:
X_test = X_test['selftext']

In [44]:
cvec.fit_transform(X_train).todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [45]:
# Fit CountVectorizer on the training data and transform training data.
X_train_cvec = pd.DataFrame(
    cvec.fit_transform(X_train).todense(),
    columns = cvec.get_feature_names()
)

In [46]:
# Transform testing data with the already-fit CountVectorizer.
X_test_cvec = pd.DataFrame(
    cvec.transform(X_test).todense(),
    columns = cvec.get_feature_names()
)

In [47]:
X_train_cvec.head()

Unnamed: 0,00,000,00000000000000000000000000000000000000000000000001,00000000001,000000000091d83a,000000022,00001,000015,00002492,0000566lightspeed,...,𝚁𝚘𝚜𝚠𝚎𝚕𝚕,𝚌𝚘𝚟𝚎𝚛𝚎𝚍,𝚍𝚊𝚢,𝚏𝚕𝚢𝚒𝚗𝚐,𝚒𝚗,𝚘𝚒𝚕,𝚘𝚗,𝚛𝚊𝚒𝚗𝚢,𝟙𝟠,𝟷𝟿𝟺𝟽
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Fit Naive Bayers model

In [48]:
from sklearn.naive_bayes import MultinomialNB

In [49]:
nb = MultinomialNB()

In [50]:
model = nb.fit(X_train_cvec, y_train)

In [51]:
predictions = model.predict(X_test_cvec)

In [52]:
X_train_cvec.shape

(14196, 45892)

In [53]:
y_train.shape

(14196,)

In [54]:
model.score(X_train_cvec, y_train)

0.9175119752042828

In [55]:
model.score(X_test_cvec, y_test)

0.8863300232410734

In [56]:
from sklearn.metrics import confusion_matrix

In [57]:
confusion_matrix(y_test, predictions)

array([[2171,  197],
       [ 341, 2024]], dtype=int64)

In [58]:
tn, fp, fn, tp = confusion_matrix(y_test, predictions).ravel()

In [59]:
print(f"True Negatives: {tn}")
print(f"False Positives: {fp}")
print(f"False Negatives: {fn}")
print(f"True Positives: {tp}")

True Negatives: 2171
False Positives: 197
False Negatives: 341
True Positives: 2024


In [60]:
from sklearn.metrics import classification_report

In [61]:
my_tags = ['aliens', 'space']

In [62]:
print(classification_report(y_test, predictions,target_names=my_tags))

              precision    recall  f1-score   support

      aliens       0.86      0.92      0.89      2368
       space       0.91      0.86      0.88      2365

    accuracy                           0.89      4733
   macro avg       0.89      0.89      0.89      4733
weighted avg       0.89      0.89      0.89      4733



# Fit dataset using pipeline & gridsearch

In [63]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer

In [64]:
my_tags = ['aliens', 'space']

In [65]:
# TFIDF pipeline setup
tvc_pipe = Pipeline([
    ('tvec', TfidfVectorizer()),
    ('nb', MultinomialNB())
])

# Random Forest

In [66]:
# Randomforest pipeline setup
rf_pipe = Pipeline([
 ('tvec', TfidfVectorizer()),
 ('rf', RandomForestClassifier())
])

In [67]:
tvc_pipe.fit(X_train, y_train)

Pipeline(steps=[('tvec', TfidfVectorizer()), ('nb', MultinomialNB())])

# Random Forest

In [68]:
rf_pipe.fit(X_train, y_train)

Pipeline(steps=[('tvec', TfidfVectorizer()), ('rf', RandomForestClassifier())])

In [69]:
tf_params = {
    'tvec__stop_words': [None, 'english'],
    'tvec__ngram_range': [(1, 1), (1, 2), (2, 2)],
    'tvec__max_features': [20000, 50000, 100000],
}

In [70]:
rf_params = {
 'tvec__max_features': [2000, 20000, 100000],
 'tvec__ngram_range': [(1, 1), (1, 2), (2, 2)],
 'tvec__stop_words': [None, 'english'],
 'rf__max_depth': [50000, 10000, 200000],
 'rf__min_samples_split': [50, 100, 500],
 'rf__max_leaf_nodes': [None]
}

In [71]:
tvc_gs = GridSearchCV(tvc_pipe, param_grid=tf_params, cv = 5, verbose = 1, n_jobs = -1)

In [72]:
tvc_gs.fit(X_train, y_train)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('tvec', TfidfVectorizer()),
                                       ('nb', MultinomialNB())]),
             n_jobs=-1,
             param_grid={'tvec__max_features': [20000, 50000, 100000],
                         'tvec__ngram_range': [(1, 1), (1, 2), (2, 2)],
                         'tvec__stop_words': [None, 'english']},
             verbose=1)

In [73]:
tvc_gs.score(X_train, y_train)

0.9210340941110172

In [74]:
tvc_gs.score(X_test, y_test)

0.8890766955419396

In [75]:
print('best params:', tvc_gs.best_params_)

best params: {'tvec__max_features': 20000, 'tvec__ngram_range': (1, 2), 'tvec__stop_words': 'english'}


In [76]:
print('best score:', tvc_gs.best_score_)

best score: 0.8853186749946669


# Random Forest

In [77]:
rf_gs = GridSearchCV(rf_pipe, param_grid=rf_params, cv = 5, verbose = 1, n_jobs = -1)

In [None]:
rf_gs.fit(X_train, y_train)

Fitting 5 folds for each of 162 candidates, totalling 810 fits


In [None]:
print('best params:', rf_gs.best_params_)

In [None]:
print('best score:', rf_gs.best_score_)

In [None]:
rf_gs.score(X_train, y_train)

In [None]:
rf_gs.score(X_test, y_test)

# y_pred

In [None]:
y_pred = tvc_gs.predict(X_test)

In [None]:
print(classification_report(y_test, y_pred,target_names=my_tags))

# Random Forest

In [None]:
y_pred1 = rf_gs.predict(X_test)

In [None]:
print(classification_report(y_test, y_pred1,target_names=my_tags))