# Project 3: Reddit

### Data Modeling

In [106]:
#import libraries
import pandas as pd 
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier,BaggingClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix

**Read in the combined Violinist and Cello CSV ("reddit_data") to begin modeling**

In [43]:
# Load the combined subreddit posts from the project's data folder
reddit_data = pd.read_csv(filepath_or_buffer='../data/reddit_data.csv')

In [44]:
# Preview the first five rows to confirm the file loaded as expected
reddit_data.head(n=5)

Unnamed: 0,author_id,posted_on,author_tag,post_title,post_description,subreddit_name
0,o24eoj,2021-06-17 18:41:15,Advanced,ysaÿe violin concerto e minor first movement l...,[https://www.youtube.com/watch?v=PONbtxcPcKQ](...,1
1,o23xyv,2021-06-17 18:21:58,,video perform exampl great vibrato,I've been struggling with my vibrato and am lo...,1
2,o20e4w,2021-06-17 15:48:10,Adult Beginner,count time 6 8,My current rhythm exercise is in 6/8 time.\n\n...,1
3,o1vxwd,2021-06-17 12:22:01,,interest pick violin,"Hello everyone, as you can tell by the title I...",1
4,o1rmpe,2021-06-17 07:49:56,,question beginn,"When the score says Sul G, Sul D, IVeme chord,...",1


In [45]:
# List the rows whose post_title is missing.
# A row without a title carries no text to model on, so it is a candidate for removal.
reddit_data.loc[reddit_data['post_title'].isnull()]

Unnamed: 0,author_id,posted_on,author_tag,post_title,post_description,subreddit_name
381,n9jckr,2021-05-10 23:34:01,,,f,1


In [47]:
# Drop every row whose post_title is missing.
# Selecting by condition (instead of a hard-coded position such as 381) keeps this cell
# idempotent: re-running it cannot remove a valid row, and it still works if the
# underlying CSV gains or loses rows.
reddit_data = reddit_data.dropna(subset=['post_title'])

In [49]:
# Sanity check: this should display an empty frame once the null-title rows are gone
reddit_data.loc[reddit_data['post_title'].isnull()]

Unnamed: 0,author_id,posted_on,author_tag,post_title,post_description,subreddit_name


**Set X and y**

In [50]:
# Feature: the post title text; target: the subreddit label (1 = Violinist, 0 = Cello)
X, y = reddit_data['post_title'], reddit_data['subreddit_name']

In [51]:
# X and y must have the same number of rows before splitting
print(f'Shape of X: {X.shape}', f'Shape of y: {y.shape}', sep='\n')

Shape of X: (2086,)
Shape of y: (2086,)


In [52]:
# Peek at the first three titles
X.iloc[:3]

0    ysaÿe violin concerto e minor first movement l...
1                   video perform exampl great vibrato
2                                       count time 6 8
Name: post_title, dtype: object

In [53]:
# Peek at the first three labels
y.iloc[:3]

0    1
1    1
2    1
Name: subreddit_name, dtype: int64

In [54]:
# Peek at the last three labels
y.iloc[-3:]

2084    0
2085    0
2086    0
Name: subreddit_name, dtype: int64

**Baseline Score**

In [55]:
# Class proportions of the target; the largest share is the accuracy a model
# would get by always predicting the majority class (the baseline to beat).
y.value_counts(normalize=True)

1    0.507191
0    0.492809
Name: subreddit_name, dtype: float64

- Violinist = 1 and Cello = 0
- The baseline score is the accuracy obtained by always predicting the majority class. Since the majority class is Violinist, the baseline score is about 0.507.

**Train Test Split**

In [56]:
# Hold out a test set (default split size). Stratifying on y keeps the slight
# class imbalance identical in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=42
)

**CountVectorize the data**

In [57]:
# Bag-of-words vectorizer over unigrams up to 5-grams.
# min_df=2 discards any term that appears in fewer than 2 documents.
cv = CountVectorizer(ngram_range=(1, 5), min_df=2)

In [58]:
# Learn the vocabulary from the training titles ONLY, then vectorize them.
# The test titles are merely transformed with that fitted vocabulary, so no
# information from the test set leaks into the features.
cv_train = cv.fit_transform(X_train)            
cv_test = cv.transform(X_test)

In [59]:
# Create cv_train dataframe: dense copy of the sparse count matrix, one column per n-gram.
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older installations.
feature_names = (
    cv.get_feature_names_out()
    if hasattr(cv, 'get_feature_names_out')
    else cv.get_feature_names()
)
cv_train_df = pd.DataFrame(cv_train.toarray(), columns=feature_names)
cv_train_df.head()

Unnamed: 0,10,100,11,15,18,18 year,18 year old,1965,1965 sure,1st,...,yamaha,yamaha svc,year,year break,year old,young,youth,youtub,zoom,zoom meetup
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [60]:
# Create cv_test dataframe: dense copy of the sparse count matrix, one column per n-gram.
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older installations.
feature_names = (
    cv.get_feature_names_out()
    if hasattr(cv, 'get_feature_names_out')
    else cv.get_feature_names()
)
cv_test_df = pd.DataFrame(cv_test.toarray(), columns=feature_names)
cv_test_df.head()

Unnamed: 0,10,100,11,15,18,18 year,18 year old,1965,1965 sure,1st,...,yamaha,yamaha svc,year,year break,year old,young,youth,youtub,zoom,zoom meetup
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [85]:
# Both matrices must share the same number of columns (the fitted vocabulary size)
print(f'Shape of cv_train: {cv_train.shape}', f'Shape of cv_test: {cv_test.shape}', sep='\n')

Shape of cv_train: (1564, 1332)
Shape of cv_test: (522, 1332)


**Logistic Regression**

In [120]:
# Instantiate model: L1-penalised logistic regression (C=50, liblinear solver).
# random_state is pinned because liblinear shuffles the data internally; without a
# seed the fitted coefficients and scores can differ from run to run. 42 matches
# the seed used by the other models in this notebook.
lr = LogisticRegression(penalty='l1', C=50, solver="liblinear", random_state=42)
# Fit model on the count-vectorized training titles
lr.fit(cv_train, y_train)
# Evaluate model: accuracy on the training split, then on the held-out test split
print("Logistic Regression".center(25, "="))
print(lr.score(cv_train, y_train))
print(lr.score(cv_test, y_test))
print()

===Logistic Regression===
0.9661125319693095
0.7203065134099617



In [119]:
# Instantiate model: L2-penalised logistic regression (C=50, liblinear solver).
# random_state is pinned because liblinear shuffles the data internally; without a
# seed the fitted coefficients and scores can differ from run to run. 42 matches
# the seed used by the other models in this notebook.
lr = LogisticRegression(penalty='l2', C=50, solver="liblinear", random_state=42)
# Fit model on the count-vectorized training titles
lr.fit(cv_train, y_train)
# Evaluate model: accuracy on the training split, then on the held-out test split
print("Logistic Regression".center(25, "="))
print(lr.score(cv_train, y_train))
print(lr.score(cv_test, y_test))
print()

===Logistic Regression===
0.9635549872122762
0.7337164750957854



- The Logistic Regression model scores much higher on the training data than on the testing data, which indicates that the model is overfit.

In [75]:
# Build a one-column frame of the logistic regression coefficients, indexed by n-gram
# and sorted from most positive to most negative. Positive weights push a post toward
# class 1 (Violinist), negative weights toward class 0 (Cello).
# NOTE(review): this uses whichever `lr` was fitted most recently — confirm it is the intended model.
coef_df = (
    pd.DataFrame(lr.coef_, columns=cv_train_df.columns)
    .T
    .sort_values(by=0, ascending=False)
)

In [93]:
# The 10 largest positive coefficients: terms most indicative of a Violin post
coef_df.iloc[:10]

Unnamed: 0,0
violin,3.825757
violinist,2.005478
gener,1.312008
keep,1.084604
techniqu,1.060101
becom,0.995871
rest,0.983141
pleas,0.961132
paganini,0.957701
post,0.947658


In [94]:
# The 10 most negative coefficients: terms most indicative of a Cello post
coef_df.iloc[-10:]

Unnamed: 0,0
hey guy,-0.927045
piano,-0.966948
major,-0.983423
prelud,-1.007672
postur,-1.041721
posit,-1.272672
zoom,-1.276807
suit,-1.362779
cellist,-2.112829
cello,-4.230653


**Multinomial Naive Bayes Model**

In [97]:
# Multinomial Naive Bayes with default settings, trained on the n-gram counts
mnb = MultinomialNB().fit(cv_train, y_train)
# Banner, then training accuracy followed by testing accuracy
print("MNB".center(18, "="))
print(mnb.score(cv_train, y_train), mnb.score(cv_test, y_test), sep="\n")
print()

0.860613810741688
0.7298850574712644



- The Multinomial Naive Bayes model is also overfit.
- Compared to Logistic Regression, this model performs worse on the training data and similarly on the testing data.

**Decision Tree Classifier**

In [103]:
# Single decision tree, seeded so the fitted tree is repeatable
dt = DecisionTreeClassifier(random_state=42).fit(cv_train, y_train)
# Banner, then training accuracy followed by testing accuracy
print("Decision Tree".center(18, "="))
print(dt.score(cv_train, y_train), dt.score(cv_test, y_test), sep="\n")
print()

==Decision Tree===
0.9763427109974424
0.7088122605363985



- The Decision Tree Classifier is also overfit.
- Its testing score is lower than that of both Logistic Regression and Multinomial Naive Bayes.
- Its training score is higher than that of both Logistic Regression and Multinomial Naive Bayes.

**Bagging Classifier**

In [109]:
# Bagging ensemble of 100 base estimators (library default estimator), seeded for repeatability
bag = BaggingClassifier(n_estimators=100, random_state=42).fit(cv_train, y_train)
# Banner, then training accuracy followed by testing accuracy
print("Bagging Tree".center(18, "="))
print(bag.score(cv_train, y_train), bag.score(cv_test, y_test), sep="\n")
print()

===Bagging Tree===
0.9763427109974424
0.7030651340996169



- Bagging Classifier is also overfit.