# Exploratory Data Analysis

## Imports, Read-in

In [54]:
# Data manip.
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# scikit-learn
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn.pipeline import Pipeline

# NLTK
import nltk
from nltk.corpus import stopwords
# English stop-word list from NLTK's stopwords corpus.
# NOTE(review): presumably requires the corpus to be downloaded beforehand
# (e.g. nltk.download('stopwords')) — confirm for fresh environments.
sw = stopwords.words('english')

# Project-local helpers: put ../src on the import path so parse_it resolves.
import sys
sys.path.append( '../src' )
# Wildcard import; parse_doc (called in a later cell) is expected to come from here.
# NOTE(review): the star import hides which other names this notebook relies on.
from parse_it import *

In [55]:
# Load the two article sets (good.csv, then promotional.csv) from the data directory.
df_good, df_promo = map(
    pd.read_csv,
    ('../../data/good.csv', '../../data/promotional.csv'),
)

In [56]:
# Run the project parser on one sample article (row position 138).
# NOTE: per this cell's recorded output, the call currently fails with
# "NameError: name 'stem' is not defined".
parse_doc(df_good['text'].iloc[138])

NameError: name 'stem' is not defined

In [20]:
# Raw text of the good-set article at row position 138.
df_good['text'].iloc[138]

"The 8th Military Police Brigade is a military police brigade of the United States Army based at Schofield Barracks, Hawaii. It is responsible for Army military police units within the United States Indo Pacific Command Area of Responsibility. Activated during the Vietnam War, the 8th Military Police Brigade was specifically organized to provide planning, direction, and supervision for the criminal investigation work required by the U.S. Army in Vietnam. It replaced a provisional Military Police Group Criminal Investigation that had been formed on 3 November 1966 in charge of all criminal investigative work in Vietnam, except for the metropolitan Saigon area. In July 1972, it became the basis for the U.S. Army Criminal Investigation Center, Vietnam Field Office. It served in the region for several years and earning ten campaign streamers before being deactivated during the American forces pullout from the region. The brigade was reactivated in Korea in the 1980s to provide command and 

In [21]:
# Raw text of the promotional-set article at row position 138.
df_promo['text'].iloc[138]

'Susan Abrams born 1964 is an American business executive who is the Chief Executive Officer of the Illinois Holocaust Museum and Education Center. Abrams was born in New York City and attended the University of Pennsylvania, graduating in 1986 from the Wharton School summa cum laude. Abrams went on to get her Master of Management from Kellogg Graduate School of Management at Northwestern University. Abrams has served on the Chicago Childrens Museum Board of directors as well as the Board of Advisors for the Womens Business Association and Center for Nonprofit Management at Kellogg School of Management. After graduation, Abrams became a financial analyst at Goldman Sachs and, later, a management consultant at McKinsey Company. After McKinsey, Abrams took the position of Vice President of Business, Strategic Planning, Marketing and Communications at the Chicago Childrens Museum. In 1997, Abrams left the Chicago Childrens Museum. Three years later, she published The New Success Rules for

In [22]:
# (rows, columns) of each raw frame, one per line: good first, then promotional.
print(df_good.shape, df_promo.shape, sep='\n')

(30279, 2)
(23837, 7)


In [23]:
# Preview the first three rows of the good-article frame.
df_good.iloc[:3]

Unnamed: 0,text,url
0,Nycticebus linglom is a fossil strepsirrhine p...,https://en.wikipedia.org/wiki/%3F%20Nycticebus...
1,Oryzomys pliocaenicus is a fossil rodent from ...,https://en.wikipedia.org/wiki/%3F%20Oryzomys%2...
2,.hack dt hk is a series of single player actio...,https://en.wikipedia.org/wiki/.hack%20%28video...


In [24]:
# Preview the first three rows of the promotional-article frame.
df_promo.iloc[:3]

Unnamed: 0,text,advert,coi,fanpov,pr,resume,url
0,"1 Litre no Namida 1, lit. 1 Litre of Tears als...",0,0,1,0,0,https://en.wikipedia.org/wiki/1%20Litre%20no%2...
1,"1DayLater was free, web based software that wa...",1,1,0,0,0,https://en.wikipedia.org/wiki/1DayLater
2,1E is a privately owned IT software and servic...,1,0,0,0,0,https://en.wikipedia.org/wiki/1E


In [25]:
# Column names, non-null counts and dtypes for the good-article frame.
df_good.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30279 entries, 0 to 30278
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    30279 non-null  object
 1   url     30279 non-null  object
dtypes: object(2)
memory usage: 473.2+ KB


In [26]:
# Column names, non-null counts and dtypes for the promotional-article frame.
df_promo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23837 entries, 0 to 23836
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    23837 non-null  object
 1   advert  23837 non-null  int64 
 2   coi     23837 non-null  int64 
 3   fanpov  23837 non-null  int64 
 4   pr      23837 non-null  int64 
 5   resume  23837 non-null  int64 
 6   url     23837 non-null  object
dtypes: int64(5), object(2)
memory usage: 1.3+ MB


In [17]:
# Tally the values of every numeric column in the promotional frame,
# printing one table per column followed by a blank line.
numeric_flags = df_promo.select_dtypes(include='number').columns
for col in numeric_flags:
    tally = df_promo[[col]].value_counts()
    print(f"{tally}\n")

advert
1         18911
0          4926
dtype: int64

coi
0      21695
1       2142
dtype: int64

fanpov
0         22341
1          1496
dtype: int64

pr
0     22321
1      1516
dtype: int64

resume
0         21639
1          2198
dtype: int64



## Set-up for Simple Binary Classification

In [17]:
# Keep only the article text for the binary task. The explicit .copy() makes
# each result an independent frame, so adding a column to it afterwards does
# not trigger pandas' SettingWithCopyWarning.
df_good = df_good[['text']].copy()
df_promo = df_promo[['text']].copy()

In [24]:
# Label good articles as class 0. assign() returns a new frame instead of
# writing into a column subset of the loaded data, which avoids
# SettingWithCopyWarning and keeps the cell safe to re-run.
df_good = df_good.assign(label=0)
df_good.head(3)

Unnamed: 0,text,label
0,Nycticebus linglom is a fossil strepsirrhine p...,0
1,Oryzomys pliocaenicus is a fossil rodent from ...,0
2,.hack dt hk is a series of single player actio...,0


In [29]:
# Label promotional articles as class 1. assign() returns a new frame instead
# of writing into a column subset of the loaded data, which avoids
# SettingWithCopyWarning and keeps the cell safe to re-run.
df_promo = df_promo.assign(label=1)
df_promo.head(3)

Unnamed: 0,text,label
0,"1 Litre no Namida 1, lit. 1 Litre of Tears als...",1
1,"1DayLater was free, web based software that wa...",1
2,1E is a privately owned IT software and servic...,1


In [30]:
# Stack the good rows followed by the promotional rows into one frame with a
# fresh 0..n-1 index. pd.concat replaces DataFrame.append, which was
# deprecated in pandas 1.4 and removed in pandas 2.0.
df = pd.concat([df_good, df_promo], ignore_index=True)

In [31]:
# Show the combined frame; the notebook display elides the middle rows.
df

Unnamed: 0,text,label
0,Nycticebus linglom is a fossil strepsirrhine p...,0
1,Oryzomys pliocaenicus is a fossil rodent from ...,0
2,.hack dt hk is a series of single player actio...,0
3,The You Drive Me Crazy Tour was the second con...,0
4,0 8 4 is the second episode of the first seaso...,0
...,...,...
54111,ZURICH.MINDS is a non profit foundation set up...,1
54112,"zvelo, Inc. or simply zvelo is a privately hel...",1
54113,Zygote Media Group is a 3D human anatomy conte...,1
54114,Zylom is a distributor of casual games for PC ...,1
