# Thinkful Unit 4- Unsupervised Learning Capstone

For my unsupervised learning capstone I took tweets from major news organizations to see how similar, or dissimilar, they might be. In this, I will clean the data, tokenize it, prepare it with Bag of Words and TF-IDF. Then, I will use unsupervised methods like KMeans Clustering, and Latent Semantic Analysis (LSA) to analyze the data, before using classifiers to determine how alike the tweets are, and if the news agencies are actually reporting similar information, and if the classifiers can accurately predict the source of the tweet.

In [34]:
#initial imports
import tweepy
import numpy as np
import pandas as pd
import json
from collections import Counter
import re
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
#SpaCy
import spacy
#sklearn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.cluster import KMeans
from sklearn.preprocessing import normalize
from sklearn.decomposition import PCA
from sklearn.cluster import MiniBatchKMeans
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import cross_val_score
from sklearn import ensemble
from sklearn.decomposition import PCA
from sklearn.cluster import MeanShift
from sklearn.cluster import estimate_bandwidth
from sklearn.cluster import KMeans
from sklearn.cluster import MiniBatchKMeans
#BeautifulSoup and NLTK
from bs4 import BeautifulSoup
from nltk.tokenize import WordPunctTokenizer
from nltk.corpus import stopwords # Import the stop word list

datatf




#### Let's double check everything went through properly.

In [42]:
user = api.me()
print (user.name)

datatf


#### Yep! Good to go. Let's pick the Twitter handles we'll analyze.

In [43]:
handles = ['CNN','FoxNews','NPR','Reuters','WSJ','NYTimes', 'USATODAY', 'washingtonpost', 'BBCWorld']

#### We'll make some JSON files to store the tweets in to create a dataframe. 

In [44]:
#Open a blank .json file for each author, will append tweets later
for handle in handles:
    with open(f'/Users/tiffanyfrench/Desktop/Capstone/{handle}2.json', 'w') as f:
        try: 
            json.load(f)
        except ValueError:
            empty = [] #Just added this line to handle exceptions, does nothing

In [45]:
#Twitter's API only allows for 200 records per call
TWEETS_PER_CALL = 200 

def save_tweets(screen_nm, tweets):
    # 'a' appends to the .json we wrote with 'w' above
    with open(f'/Users/tiffanyfrench/Desktop/Capstone/{screen_nm}2.json', 'a') as f:
        for tweet in tweets:
            json.dump([tweet.text, screen_nm], f); f.write('\n')

def get_tweets(screen_nm, desired_ct):
    
    #desired_ct is the number of tweets the user wants to include
    tweet_list = []

    #grab the current maximum tweet id for provided screen name, which will be the id of the first tweet stored in the object
    curr_max_id = api.user_timeline(screen_name = screen_nm)[0].id 
    
    #may have to do more than 1 call to the API (if user wants > TWEETS_PER_CALL tweets)
    remaining = desired_ct
        
    while remaining > 0:
        tweets = api.user_timeline(screen_name = screen_nm, count = min(remaining, TWEETS_PER_CALL), max_id = curr_max_id, include_rts=False)
        
        #save these tweets in a json file for later, in case kernel crashes or want to reference at a later date
        save_tweets(screen_nm, tweets)
        
        for tweet in tweets:
            tweet_list.append([tweet.text, screen_nm])
                
        remaining = remaining - min(remaining, TWEETS_PER_CALL)
        
        #re-set the maximum id for every TWEETS_PER_CALL tweets, so that we don't include duplicate tweets
        curr_max_id = tweets[-1].id 

    return tweet_list

In [46]:
#Create tweet lists
cnn = get_tweets('CNN', 3200)
fox = get_tweets('FoxNews', 3200)
reuters = get_tweets('Reuters', 3200)
npr = get_tweets('NPR', 3200)
nyt = get_tweets('NYTimes', 3200)
wsj = get_tweets('WSJ', 3200)
usa = get_tweets('USATODAY', 3200)
wapo = get_tweets('washingtonpost', 3200)
bbc = get_tweets('BBCWorld', 3200)

In [47]:
#Combine all tweets into 1 list

full_tweets = []
for handle in handles:
    with open('/Users/tiffanyfrench/Desktop/Capstone/{}2.json'.format(handle)) as f: 
        full_tweets.append([json.loads(line) for line in f])
        

all_tweets = []
for auth in full_tweets:
    all_tweets.extend(auth[:2500])
    
#Store in dataframe
df = pd.DataFrame(all_tweets, columns = ['Tweets', 'Org'])

In [48]:
df.tail(10)

Unnamed: 0,Tweets,Org
22490,Seven killed in raging fire along South Africa...,BBCWorld
22491,Pittsburgh shooting: Anger at Pence rally 'Jes...,BBCWorld
22492,Russian aircraft carrier Admiral Kuznetsov dam...,BBCWorld
22493,"""Until Saudi Arabia officially announced his d...",BBCWorld
22494,India journalist and police killed in 'Maoist ...,BBCWorld
22495,Filipina women detained at Halloween party in ...,BBCWorld
22496,Venice under water as Italy hit by fierce wind...,BBCWorld
22497,North and South Korea: The football game that ...,BBCWorld
22498,Julian Assange: Ecuador court rejects lawsuit ...,BBCWorld
22499,Geoffrey Rush: Actress 'frightened' by alleged...,BBCWorld


In [49]:
#Checked to see how many tweets were from each organization. I dropped the AP because they had too few after RT's were 
#removed. Now, we'll just see the first 2,500 from each org, because they all had at least that many.
df.groupby(['Org']).count()

Unnamed: 0_level_0,Tweets
Org,Unnamed: 1_level_1
BBCWorld,2500
CNN,2500
FoxNews,2500
NPR,2500
NYTimes,2500
Reuters,2500
USATODAY,2500
WSJ,2500
washingtonpost,2500


### Alright! We have our tweets. It looks like there are about 22,500 for us to train on. This should be a great start. First, let's clean and tokenize to prep for unsupervised learning.

In [17]:
#tweets_full = ' '.join(df['Tweets'])
#len(tweets_full)
#type(tweets_full)

In [18]:
#df.Tweets.apply(lambda x: x.replace("r'/n'", ''))
#df.Tweets.apply(lambda x: x.replace("r'\\n'", ''))
#df.Tweets.apply(lambda x: x.replace("r'\n'", ''))
#df.Tweets.apply(lambda x: x.replace("r'--'", ''))
#df.Tweets.apply(lambda x: x.replace("r'-'", ''))
#df.Tweets.apply(lambda x: x.replace('https:', ''))
#df.Tweets.apply(lambda x: x.replace("r'amp'", ''))
#df.Tweets.apply(lambda x: x.replace("r'&'", ''))
#df.Tweets.apply(lambda x: x.replace("r'w/'", ''))
#df.Tweets.apply(lambda x: x.replace("'s", ''))
#df.Tweets.apply(lambda x: x.replace('"', ''))
#df.Tweets.apply(lambda x: x.replace('#', ''))
#df.Tweets.apply(lambda x: x.replace('https://', ''))
#df.Tweets.apply(lambda x: x.replace('t.co', ''))

In [19]:
#df['Tweets'] = df['Tweets'].str.lower()

In [20]:
#df['Org'] = df['Org'].str.lower()

In [50]:
df.head(10)

Unnamed: 0,Tweets,Org
0,Republicans are divided on whether Trump shoul...,CNN
1,Why do Americans love #denim? See where the lo...,CNN
2,Former Texas Rep. Beto O'Rourke will be interv...,CNN
3,Mexico's gas shortage fuels long lines at the ...,CNN
4,National park visitors cut down protected Josh...,CNN
5,The current government shutdown has resulted f...,CNN
6,Sun-lovers and fans of Mickey and Minnie flock...,CNN
7,Investors thought Edible Arrangements would fa...,CNN
8,"""Oh my God. Oh my God. I'm shaking."" This Milw...",CNN
9,A team of scientists discovered a series of re...,CNN


### Prep for CountVectorizer

#### Gets rid of numbers, urls and turns into lowercase

In [51]:
tok = WordPunctTokenizer()
pat1 = r'@[A-Za-z0-9]+'
pat2 = r'https?://[A-Za-z0-9./]+'
combined_pat = r'|'.join((pat1, pat2))

def clean_tweet(raw_tweet):
    # 1. Remove HTML
    soup = BeautifulSoup(raw_tweet)
    souped = soup.get_text()
    #
    # 2. Remove non-letters        
    #letters_only = re.sub("[^a-zA-Z]", " ", review_text)
    stripped = re.sub(combined_pat, '', souped)
    try:
        clean = stripped.decode("utf-8-sig").replace(u"\ufffd", "?")
    except:
        clean = stripped
    letters_only = re.sub("[^a-zA-Z]", " ", clean)
    #
    # 3. Convert to lower case, split into individual words
    lower_case = letters_only.lower()                             
    #
    # 4. In Python, searching a set is much faster than searching
    #   a list, so convert the stop words to a set
    stops = set(stopwords.words("english"))                  
    # 
    # 5. Remove stop words
    #meaningful_words = [w for w in words if not w in stops]   
    #
    # 6. Join the words back into one string separated by space, 
    # and return the result.
    #return( " ".join( meaningful_words ))   
    words = tok.tokenize(lower_case)
    return (" ".join(words)).strip()

In [52]:
for i in df:
    for i in range(len(df)):
        df['Tweets'][i] = clean_tweet(df['Tweets'][i])

  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup


In [53]:
df.head(10)

Unnamed: 0,Tweets,Org
0,republicans are divided on whether trump shoul...,CNN
1,why do americans love denim see where the love...,CNN
2,former texas rep beto o rourke will be intervi...,CNN
3,mexico s gas shortage fuels long lines at the ...,CNN
4,national park visitors cut down protected josh...,CNN
5,the current government shutdown has resulted f...,CNN
6,sun lovers and fans of mickey and minnie flock...,CNN
7,investors thought edible arrangements would fa...,CNN
8,oh my god oh my god i m shaking this milwaukee...,CNN
9,a team of scientists discovered a series of re...,CNN


In [54]:
type(df)

pandas.core.frame.DataFrame

### Splitting this up for later classifiers

In [55]:
y = df.Org
X = df.Tweets

In [56]:
y_train, y_test = train_test_split(df.Org, test_size=0.25, random_state=42)

### BoW with SKLearn CountVectorizer

In [57]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(stop_words="english", analyzer='word', 
                            ngram_range=(1, 1), max_df=1.0, min_df=1, max_features=None)
bow = vectorizer.fit_transform(X)
print(vectorizer.get_feature_names())



In [58]:
bow.shape

(22500, 21583)

### Train and Test Sets

In [59]:
X_train, X_test = train_test_split(bow, test_size=0.25, random_state=42)

In [60]:
tfidf_transformer = TfidfTransformer()
tfidf = tfidf_transformer.fit_transform(bow)
tfidf.shape

(22500, 21583)

In [61]:
X_train_tfidf, X_test_tfidf = train_test_split(tfidf, test_size=0.25, random_state=42)

### Mean Shift with Bag of Words

In [33]:
from sklearn.cluster import MeanShift, estimate_bandwidth

# Here we set the bandwidth. This function automatically derives a bandwidth
# number based on an inspection of the distances among points in the data.
bandwidth = estimate_bandwidth(X_train.todense(), quantile=0.2, n_samples=500)

# Declare and fit the model.
ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
ms.fit(X_train.todense())

# Extract cluster assignments for each data point.
labels = ms.labels_

# Coordinates of the cluster centers.
cluster_centers = ms.cluster_centers_

# Count our clusters.
n_clusters_ = len(np.unique(labels))

print("Number of estimated clusters: {}".format(n_clusters_))

KeyboardInterrupt: 

### Mean Shift with TF-IDF

In [None]:
from sklearn.cluster import MeanShift, estimate_bandwidth

# Here we set the bandwidth. This function automatically derives a bandwidth
# number based on an inspection of the distances among points in the data.
bandwidth = estimate_bandwidth(X_train_tfidf.todense(), quantile=0.2, n_samples=500)

# Declare and fit the model.
ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
ms.fit(X_train_tfidf.todense())

# Extract cluster assignments for each data point.
labels = ms.labels_

# Coordinates of the cluster centers.
cluster_centers = ms.cluster_centers_

# Count our clusters.
n_clusters_ = len(np.unique(labels))

print("Number of estimated clusters: {}".format(n_clusters_))

It looks like Bag of Words might be doing a better job of distinguishing between the tweets. We have 9 accounts, and 8 clusters. That seems pretty good!

### K Means with Bag of Words

In [None]:
# Normalize the data.
X_norm = normalize(X_train)

In [None]:
# Reduce it to two components.
X_pca_bow = PCA(2).fit_transform(X_norm.todense())

# Calculate predicted values.
y_pred = KMeans(n_clusters=9, random_state=42).fit_predict(X_pca_bow)

# Plot the solution.
plt.scatter(X_pca_bow[:, 0], X_pca_bow[:, 1], c=y_pred)
plt.show()

### K Means Clustering with TF-IDF

In [None]:
#X_norm.todense()

In [None]:
# Normalize the data.
X_normt = normalize(X_train_tfidf)
# Reduce it to two components.
X_pcat = PCA(2).fit_transform(X_normt.todense())

# Calculate predicted values.
y_pred = KMeans(n_clusters=2, random_state=42).fit_predict(X_pcat)

# Plot the solution.
plt.scatter(X_pcat[:, 0], X_pcat[:, 1], c=y_pred)
plt.show()

### LSA

In [62]:
print("Number of features: %d" % X_train.get_shape()[1])

Number of features: 21583


In [63]:
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

#Our SVD data reducer.  We are going to reduce the feature space from 1379 to 130.
svd= TruncatedSVD(2100)
lsa = make_pipeline(svd, Normalizer(copy=False))
# Run SVD on the training data, then project the training data.
X_train_lsa = lsa.fit_transform(X_train_tfidf)

variance_explained=svd.explained_variance_ratio_
total_variance = variance_explained.sum()
print("Percent variance captured by all components:",total_variance*100)

#Looking at what sorts of paragraphs our solution considers similar, for the first five identified topics
paras_by_component=pd.DataFrame(X_train_lsa,index=X_train)
for i in range(5):
    print('Component {}:'.format(i))
    print(paras_by_component.loc[:,i].sort_values(ascending=False)[0:10])

Percent variance captured by all components: 63.98211039181273


ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all().

Let's see how many features we had in our TF-IDF training set.

In [None]:
print("Number of features: %d" % X_train_tfidf.get_shape()[1])

In [None]:
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

#Our SVD data reducer.  We are going to reduce the feature space to about 10% of the original.
svd= TruncatedSVD(2100)
lsa = make_pipeline(svd, Normalizer(copy=False))
# Run SVD on the training data, then project the training data.
X_train_lsa = lsa.fit_transform(X_train_tfidf)

variance_explained=svd.explained_variance_ratio_
total_variance = variance_explained.sum()
print("Percent variance captured by all components:",total_variance*100)

#Looking at what sorts of paragraphs our solution considers similar, for the first five identified topics
tweets_by_component=pd.DataFrame(X_train_lsa, index=X_train)
for i in range(5):
    print('Component {}:'.format(i))
    print(tweets_by_component.loc[:,i].sort_values(ascending=False)[0:10])