# Thinkful Unit 4- Unsupervised Learning Capstone

For my unsupervised learning capstone I took tweets from major news organizations to see how similar, or dissimilar, they might be. In this, I will clean the data, tokenize it, prepare it with Bag of Words and TF-IDF. Then, I will use unsupervised methods like KMeans Clustering, and Latent Semantic Analysis (LSA) to analyze the data, before using classifiers to determine how alike the tweets are, and if the news agencies are actually reporting similar information, and if the classifiers can accurately predict the source of the tweet.

In [111]:
#initial imports
import tweepy
import numpy as np
import pandas as pd
import json
from collections import Counter
import re
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
#SpaCy
import spacy
#sklearn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.cluster import KMeans
from sklearn.preprocessing import normalize
from sklearn.decomposition import PCA
from sklearn.cluster import MiniBatchKMeans
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import cross_val_score
from sklearn import ensemble
from sklearn.decomposition import PCA
from sklearn.cluster import MeanShift
from sklearn.cluster import estimate_bandwidth
from sklearn.cluster import KMeans
from sklearn.cluster import MiniBatchKMeans
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn import neural_network
from sklearn.model_selection import GridSearchCV
from sklearn import ensemble
from sklearn.decomposition import PCA
from sklearn.cluster import MeanShift, estimate_bandwidth
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import cross_val_score
from sklearn.utils import shuffle
#BeautifulSoup and NLTK
from bs4 import BeautifulSoup
from nltk.tokenize import WordPunctTokenizer
from nltk.corpus import stopwords # Import the stop word list

datatf




#### Let's double check everything went through properly.

In [42]:
user = api.me()
print (user.name)

datatf


#### Yep! Good to go. Let's pick the Twitter handles we'll analyze.

In [43]:
handles = ['CNN','FoxNews','NPR','Reuters','WSJ','NYTimes', 'USATODAY', 'washingtonpost', 'BBCWorld']

#### We'll make some JSON files to store the tweets in to create a dataframe. 

In [44]:
#Open a blank .json file for each author, will append tweets later
for handle in handles:
    with open(f'/Users/tiffanyfrench/Desktop/Capstone/{handle}2.json', 'w') as f:
        try: 
            json.load(f)
        except ValueError:
            empty = [] #Just added this line to handle exceptions, does nothing

In [45]:
#Twitter's API only allows for 200 records per call
TWEETS_PER_CALL = 200 

def save_tweets(screen_nm, tweets):
    # 'a' appends to the .json we wrote with 'w' above
    with open(f'/Users/tiffanyfrench/Desktop/Capstone/{screen_nm}2.json', 'a') as f:
        for tweet in tweets:
            json.dump([tweet.text, screen_nm], f); f.write('\n')

def get_tweets(screen_nm, desired_ct):
    
    #desired_ct is the number of tweets the user wants to include
    tweet_list = []

    #grab the current maximum tweet id for provided screen name, which will be the id of the first tweet stored in the object
    curr_max_id = api.user_timeline(screen_name = screen_nm)[0].id 
    
    #may have to do more than 1 call to the API (if user wants > TWEETS_PER_CALL tweets)
    remaining = desired_ct
        
    while remaining > 0:
        tweets = api.user_timeline(screen_name = screen_nm, count = min(remaining, TWEETS_PER_CALL), max_id = curr_max_id, include_rts=False)
        
        #save these tweets in a json file for later, in case kernel crashes or want to reference at a later date
        save_tweets(screen_nm, tweets)
        
        for tweet in tweets:
            tweet_list.append([tweet.text, screen_nm])
                
        remaining = remaining - min(remaining, TWEETS_PER_CALL)
        
        #re-set the maximum id for every TWEETS_PER_CALL tweets, so that we don't include duplicate tweets
        curr_max_id = tweets[-1].id 

    return tweet_list

In [46]:
#Create tweet lists
cnn = get_tweets('CNN', 3200)
fox = get_tweets('FoxNews', 3200)
reuters = get_tweets('Reuters', 3200)
npr = get_tweets('NPR', 3200)
nyt = get_tweets('NYTimes', 3200)
wsj = get_tweets('WSJ', 3200)
usa = get_tweets('USATODAY', 3200)
wapo = get_tweets('washingtonpost', 3200)
bbc = get_tweets('BBCWorld', 3200)

In [47]:
#Combine all tweets into 1 list

full_tweets = []
for handle in handles:
    with open('/Users/tiffanyfrench/Desktop/Capstone/{}2.json'.format(handle)) as f: 
        full_tweets.append([json.loads(line) for line in f])
        

all_tweets = []
for auth in full_tweets:
    all_tweets.extend(auth[:2500])
    
#Store in dataframe
df = pd.DataFrame(all_tweets, columns = ['Tweets', 'Org'])

In [48]:
df.tail(10)

Unnamed: 0,Tweets,Org
22490,Seven killed in raging fire along South Africa...,BBCWorld
22491,Pittsburgh shooting: Anger at Pence rally 'Jes...,BBCWorld
22492,Russian aircraft carrier Admiral Kuznetsov dam...,BBCWorld
22493,"""Until Saudi Arabia officially announced his d...",BBCWorld
22494,India journalist and police killed in 'Maoist ...,BBCWorld
22495,Filipina women detained at Halloween party in ...,BBCWorld
22496,Venice under water as Italy hit by fierce wind...,BBCWorld
22497,North and South Korea: The football game that ...,BBCWorld
22498,Julian Assange: Ecuador court rejects lawsuit ...,BBCWorld
22499,Geoffrey Rush: Actress 'frightened' by alleged...,BBCWorld


In [49]:
#Checked to see how many tweets were from each organization. I dropped the AP because they had too few after RT's were 
#removed. Now, we'll just see the first 2,500 from each org, because they all had at least that many.
df.groupby(['Org']).count()

Unnamed: 0_level_0,Tweets
Org,Unnamed: 1_level_1
BBCWorld,2500
CNN,2500
FoxNews,2500
NPR,2500
NYTimes,2500
Reuters,2500
USATODAY,2500
WSJ,2500
washingtonpost,2500


### Alright! We have our tweets. It looks like there are about 22,500 for us to train on. This should be a great start. First, let's clean and tokenize to prep for unsupervised learning.

In [50]:
df.head(10)

Unnamed: 0,Tweets,Org
0,Republicans are divided on whether Trump shoul...,CNN
1,Why do Americans love #denim? See where the lo...,CNN
2,Former Texas Rep. Beto O'Rourke will be interv...,CNN
3,Mexico's gas shortage fuels long lines at the ...,CNN
4,National park visitors cut down protected Josh...,CNN
5,The current government shutdown has resulted f...,CNN
6,Sun-lovers and fans of Mickey and Minnie flock...,CNN
7,Investors thought Edible Arrangements would fa...,CNN
8,"""Oh my God. Oh my God. I'm shaking."" This Milw...",CNN
9,A team of scientists discovered a series of re...,CNN


### Prep for CountVectorizer

#### Gets rid of numbers, urls and turns into lowercase

In [51]:
tok = WordPunctTokenizer()
pat1 = r'@[A-Za-z0-9]+'
pat2 = r'https?://[A-Za-z0-9./]+'
combined_pat = r'|'.join((pat1, pat2))

def clean_tweet(raw_tweet):
    # 1. Remove HTML
    soup = BeautifulSoup(raw_tweet)
    souped = soup.get_text()
    #
    # 2. Remove non-letters        
    #letters_only = re.sub("[^a-zA-Z]", " ", review_text)
    stripped = re.sub(combined_pat, '', souped)
    try:
        clean = stripped.decode("utf-8-sig").replace(u"\ufffd", "?")
    except:
        clean = stripped
    letters_only = re.sub("[^a-zA-Z]", " ", clean)
    #
    # 3. Convert to lower case, split into individual words
    lower_case = letters_only.lower()                             
    #
    # 4. In Python, searching a set is much faster than searching
    #   a list, so convert the stop words to a set
    stops = set(stopwords.words("english"))                  
    # 
    # 5. Remove stop words
    #meaningful_words = [w for w in words if not w in stops]   
    #
    # 6. Join the words back into one string separated by space, 
    # and return the result.
    #return( " ".join( meaningful_words ))   
    words = tok.tokenize(lower_case)
    return (" ".join(words)).strip()

In [52]:
for i in df:
    for i in range(len(df)):
        df['Tweets'][i] = clean_tweet(df['Tweets'][i])

  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup


In [53]:
df.head(10)

Unnamed: 0,Tweets,Org
0,republicans are divided on whether trump shoul...,CNN
1,why do americans love denim see where the love...,CNN
2,former texas rep beto o rourke will be intervi...,CNN
3,mexico s gas shortage fuels long lines at the ...,CNN
4,national park visitors cut down protected josh...,CNN
5,the current government shutdown has resulted f...,CNN
6,sun lovers and fans of mickey and minnie flock...,CNN
7,investors thought edible arrangements would fa...,CNN
8,oh my god oh my god i m shaking this milwaukee...,CNN
9,a team of scientists discovered a series of re...,CNN


In [54]:
type(df)

pandas.core.frame.DataFrame

### Splitting this up for later classifiers

In [55]:
y = df.Org
X = df.Tweets

In [56]:
y_train, y_test = train_test_split(df.Org, test_size=0.25, random_state=42)

In [72]:
#X_train, X_test = train_test_split(X, test_size=0.25, random_state=42)

## Bag of Words Analysis

### BoW with SKLearn CountVectorizer

In [73]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(stop_words="english", analyzer='word', 
                            ngram_range=(1, 1), max_df=1.0, min_df=1, max_features=None)
bow = vectorizer.fit_transform(X_train)
print(vectorizer.get_feature_names())



In [59]:
X_train, X_test = train_test_split(bow, test_size=0.25, random_state=42)

In [74]:
bow.shape

(16875, 19272)

### Mean Shift with Bag of Words

In [33]:
from sklearn.cluster import MeanShift, estimate_bandwidth

# Here we set the bandwidth. This function automatically derives a bandwidth
# number based on an inspection of the distances among points in the data.
bandwidth = estimate_bandwidth(X_train.todense(), quantile=0.2, n_samples=500)

# Declare and fit the model.
ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
ms.fit(X_train.todense())

# Extract cluster assignments for each data point.
labels = ms.labels_

# Coordinates of the cluster centers.
cluster_centers = ms.cluster_centers_

# Count our clusters.
n_clusters_ = len(np.unique(labels))

print("Number of estimated clusters: {}".format(n_clusters_))

KeyboardInterrupt: 

### K Means with Bag of Words

In [None]:
# Normalize the data.
X_norm = normalize(X_train)

In [None]:
# Reduce it to two components.
X_pca_bow = PCA(2).fit_transform(X_norm.todense())

# Calculate predicted values.
y_pred = KMeans(n_clusters=9, random_state=42).fit_predict(X_pca_bow)

# Plot the solution.
plt.scatter(X_pca_bow[:, 0], X_pca_bow[:, 1], c=y_pred)
plt.show()

### LSA with BoW

In [79]:
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

from sklearn.decomposition import TruncatedSVD
lsa = TruncatedSVD(n_components=5,n_iter=500)
lsa.fit(bow)
terms = vectorizer.get_feature_names()

for i,comp in enumerate(lsa.components_):
    termsInComp = zip(terms,comp)
    sortedterms = sorted(termsInComp, key=lambda x: x[1],reverse=True)[:10]
    print("Concept %d:" % i)
    for term in sortedterms:
        print(term[0])
    print(" ")

Concept 0:
trump
president
new
border
wall
says
year
house
shutdown
government
 
Concept 1:
new
year
york
old
city
eve
people
world
man
day
 
Concept 2:
year
old
christmas
man
girl
boy
best
border
death
woman
 
Concept 3:
government
shutdown
says
house
federal
partial
democrats
senate
border
workers
 
Concept 4:
says
china
president
house
just
people
white
north
man
leader
 


### Bigrams

In [91]:
handles = ['CNN','FoxNews','NPR','Reuters','WSJ','NYTimes', 'USATODAY', 'washingtonpost', 'BBCWorld']

In [86]:
count_vect_bigram = CountVectorizer(ngram_range=(1,2))
train_data_bow_bigram = count_vect_bigram.fit_transform(X_train)
test_data_bow_bigram = count_vect_bigram.transform(X_test)

In [None]:
#Confusion matrix for Bigram BOW with Logistic Regression

lr = LogisticRegression()
lr.fit(train_data_bow_bigram,y_train)

print('Training set score:', lr.score(train_data_bow_bigram, y_train))
print('\nTest set score:', lr.score(test_data_bow_bigram, y_test))

y_pred = lr.predict(test_data_bow_bigram)

conf = confusion_matrix(y_test, y_pred, labels = handles)

cm = pd.DataFrame(conf, index = [i for i in handles],
                  columns = [i for i in handles])
plt.figure(figsize = (15,7))
sns.heatmap(cm, annot=True, cmap="YlOrRd", fmt = 'd')
crs = cross_val_score(lr, train_data_bow_bigram, y_train, cv=10)
print("Cross-Valid Bigram Fold Results are: ",crs)
print("Mean of Folds are = ",crs.mean())

## TF-IDF Analysis

### Train and Test Sets

In [75]:
tfidf_transformer = TfidfTransformer()
tfidf = tfidf_transformer.fit_transform(bow)
tfidf.shape

(16875, 19272)

In [122]:
X_train_tfidf, X_test_tfidf = train_test_split(tfidf, test_size=0.25, random_state=42)

### Mean Shift with TF-IDF

In [None]:
from sklearn.cluster import MeanShift, estimate_bandwidth

# Here we set the bandwidth. This function automatically derives a bandwidth
# number based on an inspection of the distances among points in the data.
bandwidth = estimate_bandwidth(tfidf.todense(), quantile=0.2, n_samples=500)

# Declare and fit the model.
ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
ms.fit(X_train_tfidf.todense())

# Extract cluster assignments for each data point.
labels = ms.labels_

# Coordinates of the cluster centers.
cluster_centers = ms.cluster_centers_

# Count our clusters.
n_clusters_ = len(np.unique(labels))

print("Number of estimated clusters: {}".format(n_clusters_))

It looks like Bag of Words might be doing a better job of distinguishing between the tweets. We have 9 accounts, and 8 clusters. That seems pretty good!

### K Means Clustering with TF-IDF

In [None]:
#X_norm.todense()

In [None]:
# Normalize the data.
X_normt = normalize(tfidf)
# Reduce it to two components.
X_pcat = PCA(2).fit_transform(X_normt.todense())

# Calculate predicted values.
y_pred = KMeans(n_clusters=2, random_state=42).fit_predict(X_pcat)

# Plot the solution.
plt.scatter(X_pcat[:, 0], X_pcat[:, 1], c=y_pred)
plt.show()

In [103]:
rfc = ensemble.RandomForestClassifier()

vectorizer1 = TfidfVectorizer(max_df=0.5, # drop words that occur in more than half the tweets
                             min_df=2, # only use words that appear at least 2x
                             stop_words='english', 
                             lowercase=True, #convert everything to lower case 
                             use_idf=True,
                             norm=u'l2', #Applies a correction factor so that longer paragraphs and shorter tweets get treated equally
                             smooth_idf=True #Adds 1 to all document frequencies, as if an extra document existed that used every word once.  Prevents divide-by-zero errors
                            )

X_train_tfidf = vectorizer1.fit_transform(X_train)
X_test_tfidf = vectorizer1.transform(X_test)

fit = rfc.fit(X_train_tfidf, y_train)
y_pred = rfc.predict(X_test_tfidf)

print('Training set score:', rfc.score(X_train_tfidf, y_train))
print('\nTest set score:', rfc.score(X_test_tfidf, y_test))



Training set score: 0.9906962962962963

Test set score: 0.5111111111111111


In [105]:
from time import time
X = df['Tweets']
X_norm_tfidf = normalize(vectorizer1.fit_transform(X))

true_k = 10
labels = y
km = MiniBatchKMeans(n_clusters=true_k, init='k-means++', batch_size=5000)

print("Clustering sparse data with %s" % km)
t0 = time()
km.fit(X_norm_tfidf)
mini_labels = km.labels_
print("Done in %0.3fs" % (time() - t0))
print()
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
print("Adjusted Rand-Index: %.3f"
      % metrics.adjusted_rand_score(labels, km.labels_))
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(X_norm_tfidf, km.labels_, sample_size=5000))

Clustering sparse data with MiniBatchKMeans(batch_size=5000, compute_labels=True, init='k-means++',
        init_size=None, max_iter=100, max_no_improvement=10, n_clusters=10,
        n_init=3, random_state=None, reassignment_ratio=0.01, tol=0.0,
        verbose=0)
Done in 0.359s

Homogeneity: 0.033
Completeness: 0.062
V-measure: 0.043
Adjusted Rand-Index: 0.005
Silhouette Coefficient: 0.004


In [106]:
df['MiniBatchLabels'] = mini_labels

In [109]:
for label in set(mini_labels):
    print('Cluster: %d' % label)
    print(df[df.MiniBatchLabels == label].groupby('Org').count())
    print('')

Cluster: 0
                Tweets  MiniBatchLabels
Org                                    
BBCWorld          2043             2043
CNN               1642             1642
FoxNews           1761             1761
NPR               1887             1887
NYTimes           1715             1715
Reuters           1937             1937
USATODAY          1892             1892
WSJ               1768             1768
washingtonpost    1355             1355

Cluster: 1
                Tweets  MiniBatchLabels
Org                                    
BBCWorld            36               36
CNN                100              100
FoxNews             97               97
NPR                 79               79
NYTimes             49               49
Reuters             49               49
USATODAY            51               51
WSJ                 39               39
washingtonpost      70               70

Cluster: 2
                Tweets  MiniBatchLabels
Org                                    
BBCWo

In [113]:
#Let's check Cluster 0 - the largest, and likely a catch-all for tweets that did not fall into other clusters
shuffle(df[df.MiniBatchLabels == 0]).head(10)

Unnamed: 0,Tweets,Org,MiniBatchLabels
6536,the va s caregiver program was set up to suppo...,NPR,0
2043,more than migrants will be dropped off in el p...,CNN,0
1662,high speed trains are finally coming to the us,CNN,0
511,a new artificial intelligence technology can a...,CNN,0
12772,the south by southwest film festival has chose...,NYTimes,0
11146,as the partial government shutdown enters its ...,WSJ,0
19707,ethics panel expands probe of rep schweikert a...,washingtonpost,0
1165,rep cindy hyde smith is the first woman electe...,CNN,0
22248,two dead in marseille double building collapse,BBCWorld,0
11231,the republican controlled michigan legislature...,WSJ,0


In [114]:
#Let's check Cluster 1
shuffle(df[df.MiniBatchLabels == 1]).head(10)

Unnamed: 0,Tweets,Org,MiniBatchLabels
9721,incoming members of congress featuring a recor...,Reuters,1
20702,two men arrested in connection with roscommon ...,BBCWorld,1
1315,house minority leader nancy pelosi says democr...,CNN,1
5191,breaking president trump walked out of a meeti...,NPR,1
2733,lays out vision for dem controlled house it s ...,FoxNews,1
2594,breaking cnn s jim acosta press pass suspended...,FoxNews,1
1745,one thing the democrats say they will do when ...,CNN,1
9964,pelosi regains gavel as speaker of most divers...,Reuters,1
21907,white house aide ricardel removed after melani...,BBCWorld,1
14572,it s as if a skunk or multiple skunks in a fam...,NYTimes,1


In [115]:
#Cluster 2
shuffle(df[df.MiniBatchLabels == 2]).head(10)

Unnamed: 0,Tweets,Org,MiniBatchLabels
1336,live sen elect mitt romney talks to cnn about ...,CNN,2
6355,we have diplomatically relentlessly worked to ...,NPR,2
14210,president trump once declared that i alone can...,NYTimes,2
21531,nigeria metele attack president buhari speaks ...,BBCWorld,2
15144,i am not running for president at this time st...,USATODAY,2
16315,while wishing a merry christmas to troops over...,USATODAY,2
11991,president trump s attorney general pick critic...,WSJ,2
12288,two business partners of president trump s for...,WSJ,2
612,a national pilots association is urging presid...,CNN,2
12597,president trump canceled his trip to the econo...,NYTimes,2


### LSA with TF-IDF

In [62]:
print("Number of features: %d" % X_train.get_shape()[1])

Number of features: 21583


Let's see how many features we had in our TF-IDF training set.

In [None]:
print("Number of features: %d" % X_train_tfidf.get_shape()[1])

In [None]:
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

#Our SVD data reducer.  We are going to reduce the feature space to about 10% of the original.
svd= TruncatedSVD(2100 ,n_iter=500)
lsa = make_pipeline(svd, Normalizer(copy=False))
# Run SVD on the training data, then project the training data.
X_train_lsa = lsa.fit_transform(tfidf)

variance_explained=svd.explained_variance_ratio_
total_variance = variance_explained.sum()
print("Percent variance captured by all components:",total_variance*100)

#Looking at what sorts of tweets our solution considers similar, for the first five identified topics
tweets_by_component=pd.DataFrame(X_train_lsa, index=list(X_train))
for i in range(5):
    print('Component {}:'.format(i))
    print(tweets_by_component.loc[:,i].sort_values(ascending=False)[0:10])
import time
print(time.time())

In [68]:
from sklearn.decomposition import TruncatedSVD
lsa = TruncatedSVD(n_components=5,n_iter=500)
lsa.fit(tfidf)
terms = vectorizer.get_feature_names()

for i,comp in enumerate(lsa.components_):
    termsInComp = zip(terms,comp)
    sortedterms = sorted(termsInComp, key=lambda x: x[1],reverse=True)[:10]
    print("Concept %d:" % i)
    for term in sortedterms:
        print(term[0])
    print(" ")

Concept 0:
trump
president
border
wall
shutdown
government
new
house
year
says
 
Concept 1:
new
year
day
know
need
briefing
old
start
morning
york
 
Concept 2:
know
need
day
briefing
morning
start
end
evening
shutdown
government
 
Concept 3:
government
shutdown
partial
federal
workers
house
senate
week
affected
pay
 
Concept 4:
wall
street
look
journal
page
border
early
mexico
year
emergency
 


In [None]:
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

#Our SVD data reducer.  We are going to reduce the feature space to about 10% of the original.
svd= TruncatedSVD(2100 ,n_iter=500)
lsa = make_pipeline(svd, Normalizer(copy=False))
# Run SVD on the training data, then project the training data.
X_train_lsa = lsa.fit_transform(tfidf)

variance_explained=svd.explained_variance_ratio_
total_variance = variance_explained.sum()
print("Percent variance captured by all components:",total_variance*100)

#Looking at what sorts of tweets our solution considers similar, for the first five identified topics
tweets_by_component=pd.DataFrame(X_train_lsa, index=list(X_train))
for i in range(5):
    print('Component {}:'.format(i))
    print(tweets_by_component.loc[:,i].sort_values(ascending=False)[0:10])

## Bigrams with TF-IDF

In [124]:
count_vect_bigram = CountVectorizer(ngram_range=(1,2))
train_data_bow_bigram = count_vect_bigram.fit_transform(X_train_tfidf)


AttributeError: lower not found

In [None]:
#Confusion matrix for Bigram tf-idf with Logistic Regression

lr = LogisticRegression()
lr.fit(train_data_bow_bigram,y_train)

print('Training set score:', lr.score(train_data_bow_bigram, y_train))
print('\nTest set score:', lr.score(test_data_bow_bigram, y_test))

y_pred = lr.predict(test_data_bow_bigram)

conf = confusion_matrix(y_test, y_pred, labels = handles)

cm = pd.DataFrame(conf, index = [i for i in handles],
                  columns = [i for i in handles])
plt.figure(figsize = (15,7))
sns.heatmap(cm, annot=True, cmap="YlOrRd", fmt = 'd')
crs = cross_val_score(lr, train_data_bow_bigram, y_train, cv=10)
print("Cross-Valid Bigram Fold Results are: ",crs)
print("Mean of Folds are = ",crs.mean())