# Modeling Exercises

Do your work for this exercise in a file named `model.ipynb`.

Take the work we did in the lessons further:

1. What other types of models (i.e. different classifcation algorithms) could you use? Create a model with a different algorithm.
2. How do the models compare when trained on term frequency data alone, instead of TF-IDF values?

In [93]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from prepare import basic_clean, lemmatize
import acquire

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, plot_tree


from sklearn.metrics import classification_report, accuracy_score
from env import user, password, host

Classification algorithms to use aside from Logistic Regression could be Random Forest or Decision Tree

## Continued modeling from curriculum

CURRICULUM MODEL

In [98]:
def get_db_url(database, host=host, user=user, password=password):
    return f'mysql+pymysql://{user}:{password}@{host}/{database}'

url = get_db_url("spam_db")
sql = "SELECT * FROM spam"

df = pd.read_sql(sql, url, index_col="id")
df.head()

tfidf = TfidfVectorizer()
X = tfidf.fit_transform(df.text)
y = df.label

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=.2)

train = pd.DataFrame(dict(actual_lm=y_train))
test = pd.DataFrame(dict(actual_lm=y_test))

lm = LogisticRegression().fit(X_train, y_train)

train['lm_predicted'] = lm.predict(X_train)
test['lm_predicted'] = lm.predict(X_test)

CURRICULUM RESULTS

In [None]:
print('Accuracy: {:.2%}'.format(accuracy_score(train.actual, train.predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(train.predicted, train.actual))
print('---')
print(classification_report(train.actual, train.predicted))

Accuracy: 91.25%
---
Confusion Matrix
actual         business  entertainment  sports  technology
predicted                                                 
business             15              0       0           2
entertainment         0             20       0           0
sports                0              0      20           0
technology            5              0       0          18
---
               precision    recall  f1-score   support

     business       0.88      0.75      0.81        20
entertainment       1.00      1.00      1.00        20
       sports       1.00      1.00      1.00        20
   technology       0.78      0.90      0.84        20

     accuracy                           0.91        80
    macro avg       0.92      0.91      0.91        80
 weighted avg       0.92      0.91      0.91        80



In [None]:
print('Accuracy: {:.2%}'.format(accuracy_score(test.actual, test.predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(test.predicted, test.actual))
print('---')
print(classification_report(test.actual, test.predicted))

Accuracy: 50.00%
---
Confusion Matrix
actual         business  entertainment  sports  technology
predicted                                                 
business              2              1       0           3
entertainment         2              3       0           0
sports                1              1       5           2
---
               precision    recall  f1-score   support

     business       0.33      0.40      0.36         5
entertainment       0.60      0.60      0.60         5
       sports       0.56      1.00      0.71         5
   technology       0.00      0.00      0.00         5

     accuracy                           0.50        20
    macro avg       0.37      0.50      0.42        20
 weighted avg       0.37      0.50      0.42        20



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


FURTHER MODELING

In [95]:
train

Unnamed: 0_level_0,actual,predicted
id,Unnamed: 1_level_1,Unnamed: 2_level_1
3419,spam,spam
76,ham,ham
2065,ham,ham
4149,ham,ham
4841,ham,ham
...,...,...
563,spam,spam
1418,ham,ham
1051,ham,ham
4534,ham,ham


In [97]:
# create Decision Tree model
clf = DecisionTreeClassifier(max_depth=3, random_state = 222)

# fit the model
clf = clf.fit(X_train, y_train)


## Model to predict category of news article

In [74]:
df = acquire.acquire_news()

In [75]:
articles = [lemmatize(basic_clean(article)) for article in df.content]

In [76]:
def idf(word):
    '''A simple way to calculate idf for demonstration. Note that this 
    function relies on a globally defined blogs variable.'''
    n_occurences = sum([1 for doc in articles if word in doc])
    return len(articles) / n_occurences

In [77]:
unique_words = pd.Series(' '.join(articles).split()).unique()
unique_words

array(['rbi', 'ha', 'cancelled', ..., 'increase', 'rearend', 'collision'],
      dtype=object)

In [78]:
# put the unique words into a data frame
(pd.DataFrame(dict(word=unique_words))
 # calculate the idf for each word
 .assign(idf=lambda df: df.word.apply(idf))
 # sort the data for presentation purposes
 .set_index('word')
 .sort_values(by='idf', ascending=False))

Unnamed: 0_level_0,idf
word,Unnamed: 1_level_1
rbi,100.0
chennai,100.0
sukesh,100.0
conman,100.0
saini,100.0
...,...
he,1.0
u,1.0
in,1.0
a,1.0


In [80]:
tfidf = TfidfVectorizer()
tfidfs = tfidf.fit_transform(articles)
tfidfs

<100x1919 sparse matrix of type '<class 'numpy.float64'>'
	with 4555 stored elements in Compressed Sparse Row format>

In [81]:
pd.DataFrame(tfidfs.todense(), columns=tfidf.get_feature_names())

Unnamed: 0,10,100,1000,107,10th,11,11th,12,120000,1206,...,yuvraj,yuzvendra,zalmi,zaman,zapkeycom,zero,zipping,zone,zucker,zuckerberg
0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000
1,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.156727,0.0,0.156727,0.0,0.000000
2,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000
3,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000
4,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000
96,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000
97,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.120231
98,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000


In [82]:
df.head()

Unnamed: 0,title,author,content,category
0,RBI cancels licence of Maha-based Independence...,Shalini Ojha,RBI has cancelled licence of Maharashtra-based...,business
1,Boost to EVs a big step: Windmill Capital,Roshan Gupta,"Increased use of EVs in public transport, spec...",business
2,Facebook parent Meta's $230-billion wipeout bi...,Pragya Swastik,Facebook's parent Meta's shares plunged 27% an...,business
3,Facebook's daily active users fall for first t...,Pragya Swastik,Facebook has seen its daily active users (DAUs...,business
4,"Tesla co-worker used N-word, threw a hot tool ...",Kiran Khatri,A former Tesla worker has filed a lawsuit agai...,business


In [85]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

tfidf = TfidfVectorizer()
X = tfidf.fit_transform(df.content)
y = df.category

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=.2)

train = pd.DataFrame(dict(actual=y_train))
test = pd.DataFrame(dict(actual=y_test))

lm = LogisticRegression().fit(X_train, y_train)

train['predicted'] = lm.predict(X_train)
test['predicted'] = lm.predict(X_test)

In [86]:
print('Accuracy: {:.2%}'.format(accuracy_score(train.actual, train.predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(train.predicted, train.actual))
print('---')
print(classification_report(train.actual, train.predicted))

Accuracy: 91.25%
---
Confusion Matrix
actual         business  entertainment  sports  technology
predicted                                                 
business             15              0       0           2
entertainment         0             20       0           0
sports                0              0      20           0
technology            5              0       0          18
---
               precision    recall  f1-score   support

     business       0.88      0.75      0.81        20
entertainment       1.00      1.00      1.00        20
       sports       1.00      1.00      1.00        20
   technology       0.78      0.90      0.84        20

     accuracy                           0.91        80
    macro avg       0.92      0.91      0.91        80
 weighted avg       0.92      0.91      0.91        80



In [87]:
print('Accuracy: {:.2%}'.format(accuracy_score(test.actual, test.predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(test.predicted, test.actual))
print('---')
print(classification_report(test.actual, test.predicted))

Accuracy: 50.00%
---
Confusion Matrix
actual         business  entertainment  sports  technology
predicted                                                 
business              2              1       0           3
entertainment         2              3       0           0
sports                1              1       5           2
---
               precision    recall  f1-score   support

     business       0.33      0.40      0.36         5
entertainment       0.60      0.60      0.60         5
       sports       0.56      1.00      0.71         5
   technology       0.00      0.00      0.00         5

     accuracy                           0.50        20
    macro avg       0.37      0.50      0.42        20
 weighted avg       0.37      0.50      0.42        20



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
