In [2]:
import pandas as pd

# Problem 1: tweet classification - Trudeau vs Trump

In [3]:
# load the data
url = url = 'https://raw.githubusercontent.com/um-perez-alvaro/Data-Science-Practice/master/Data/twitter.csv'
data = pd.read_csv(url)
data.head()

Unnamed: 0,timestamp,text,user
0,2020-03-02 23:06:03,"WOW! Thank you, just landed, see everyone soon...",realDonaldTrump
1,2020-03-02 21:47:49,Departing for the Great State of North Carolin...,realDonaldTrump
2,2020-03-02 21:32:54,They are staging a coup against Bernie!,realDonaldTrump
3,2020-03-02 19:55:40,THANK YOU!https://www.breitbart.com/tech/2020/...,realDonaldTrump
4,2020-03-02 19:55:07,Michelle @FischbachMN7 is running for Congress...,realDonaldTrump


This is a corpus of tweets from Donald Trump and Justin Trudeau. 
The **goal** is to build a classification pipeline that predicts the author (Trump or Trudeau) of a tweet based on the text.

**Part 1:** Define the feature matrix X and the target vector y from the dataframe, and then split X and y into training and testing sets.

In [6]:
X = data.text
y = data.user

In [7]:
# split data
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y)

**Part 2:** build a classification pipeline (count vectorizer + Naive Bayes model), and fit the pipeline to the training data.

In [9]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix

pipe = Pipeline(steps=[
    ('vect', CountVectorizer(max_features=1000)),
    ('clf', MultinomialNB()) # can take alpha parameter
])

# fit pipe
pipe.fit(X_train,y_train)


**Part 3:** Evaluate the performance of your classification pipeline on the test set

In [10]:
# evaluate the pipeline
y_test_pred = pipe.predict(X_test)

In [11]:
confusion_matrix(y_test,y_test_pred)

array([[ 83,   2],
       [  6, 174]], dtype=int64)

**Part 4:** What words does the model use to choose between Trump or Trudeau

In [13]:
# get classes
pipe['clf'].classes_

array(['JustinTrudeau', 'realDonaldTrump'], dtype='<U15')

0: Justin
1: Trump

In [21]:
# store vocabulary
words = pipe['vect'].get_feature_names_out()

# get counts
justin = pipe['clf'].feature_count_[0]
trump = pipe['clf'].feature_count_[1]

df = pd.DataFrame({'words':words, 'justin':justin, 'trump':trump}).set_index('words')
# add 1 to the columns to avoid dividing by 0
df.justin = df.justin+1
df.trump = df.trump+1

df.tail(5)

Unnamed: 0_level_0,justin,trump
words,Unnamed: 1_level_1,Unnamed: 2_level_1
york,1.0,20.0
you,322.0,139.0
young,9.0,2.0
your,119.0,31.0
yourself,8.0,1.0


In [22]:
# convert counts into frequencies
df.justin = df.justin/df.trump.sum()
df.trump = df.trump/df.trump.sum()

# calculate ratio of ham to spam and spam to ham for each word
df['justin_ratio'] = df.justin/df.trump
df['trump_ratio'] = df.trump/df.justin

In [26]:
# top 10 justin words
df.sort_values(by='justin_ratio', ascending=False).head(10)

Unnamed: 0_level_0,justin,trump,justin_ratio,trump_ratio
words,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ca,0.007702,8.1e-05,95.0,0.010526
pm,0.007216,8.1e-05,89.0,0.011236
en,0.007135,8.1e-05,88.0,0.011364
gc,0.004702,8.1e-05,58.0,0.017241
ll,0.00454,8.1e-05,56.0,0.017857
update,0.00454,8.1e-05,56.0,0.017857
canadians,0.003892,8.1e-05,48.0,0.020833
trudeau,0.003811,8.1e-05,47.0,0.021277
re,0.016783,0.000486,34.5,0.028986
direct,0.002676,8.1e-05,33.0,0.030303


In [25]:
# top 10 trump words
df.sort_values(by='trump_ratio', ascending=False).head(10)

Unnamed: 0_level_0,justin,trump,justin_ratio,trump_ratio
words,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
fake,8.1e-05,0.004702,0.017241,58.0
conference,8.1e-05,0.002351,0.034483,29.0
media,8.1e-05,0.002351,0.034483,29.0
white,8.1e-05,0.00227,0.035714,28.0
state,8.1e-05,0.002108,0.038462,26.0
democrats,8.1e-05,0.001703,0.047619,21.0
endorsement,8.1e-05,0.001703,0.047619,21.0
china,8.1e-05,0.001703,0.047619,21.0
foxnews,8.1e-05,0.001703,0.047619,21.0
eastern,8.1e-05,0.001622,0.05,20.0


**Bonus:** can you write a Trump or Trudeau tweet?