Data manipulation and EDA for the training and testing data

In [59]:
# Run the imports/functions file
%run ./imports_functions.ipynb

# Import twitter samples and stopwords from the NLTK corpus
nltk.download('twitter_samples', quiet=True)
nltk.download('stopwords', quiet=True)

all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')

In [60]:
# Get unique values and value counts using splitTweets function from imports_functions.ipynb for positive and negative tweets
positive_counts = splitTweets(all_positive_tweets)
negative_counts = splitTweets(all_negative_tweets)

In [61]:
# Head of positive tweets value counts
positive_counts.head(10)

Unnamed: 0,unique_values,counts
0,:),3568
1,:-),692
2,:d,629
3,thank,620
4,love,400
5,follow,381
6,...,289
7,day,246
8,good,238
9,like,233


In [62]:
# Head of negative tweets value counts
negative_counts.head(10)

Unnamed: 0,unique_values,counts
0,:(,4571
1,:-(,493
2,i'm,343
3,...,331
4,miss,301
5,pleas,275
6,follow,262
7,want,246
8,get,232
9,like,228


In [63]:
# Variables for top 100 negative tweets and top 100 positive tweets
negative_top_100 = negative_counts.head(100)
positive_top_100 = positive_counts.head(100)

# Get the top frequent words in tweets without the 100 top frequent words in the other tweets
positive_without_top_neg_100 = positive_counts.merge(negative_top_100.unique_values, how = 'outer' ,indicator=True).loc[lambda x : x['_merge']=='left_only']
negative_without_top_pos_100 = negative_counts.merge(positive_top_100.unique_values, how = 'outer' ,indicator=True).loc[lambda x : x['_merge']=='left_only']
positive_without_top_neg_100 = positive_without_top_neg_100.drop(["_merge"], axis = 1)
negative_without_top_pos_100 = negative_without_top_pos_100.drop(["_merge"], axis = 1)

In [64]:
# Display the above (positive)
positive_without_top_neg_100.index = range(len(positive_without_top_neg_100.index))
positive_without_top_neg_100.head(15)

Unnamed: 0,unique_values,counts
0,:),3568.0
1,:-),692.0
2,:d,629.0
3,happi,211.0
4,hi,173.0
5,great,171.0
6,:p,138.0
7,<3,134.0
8,friday,116.0
9,morn,101.0


In [65]:
# Display the above (negative)
negative_without_top_pos_100.index = range(len(negative_without_top_pos_100.index))
negative_without_top_pos_100.head(15)

Unnamed: 0,unique_values,counts
0,:(,4571.0
1,:-(,493.0
2,miss,301.0
3,♛,210.0
4,》,210.0
5,can't,180.0
6,feel,158.0
7,sorri,148.0
8,sad,123.0
9,wanna,94.0


In [66]:
# Calculate whether the tweets are unique
# Display commented out for presentation

percent_unique_positive = pd.DataFrame(all_positive_tweets).nunique()/pd.DataFrame(all_positive_tweets).count()
# print("Unique positive tweets/total positive tweets:",int(percent_unique_positive)*100, "%")

percent_unique_negative= pd.DataFrame(all_negative_tweets).nunique()/pd.DataFrame(all_negative_tweets).count()
# print("Unique negative tweets/total negative tweets:",int(percent_unique_negative)*100, "%")

In [67]:
# Read 1000 most common words in the English language from downloaded file
most_common_words = pd.read_csv("words.csv", header = None, names = ["unique_values"])

# Remove 1000 most common words in the English language from positive and negative tweets
positive_unique = positive_counts.merge(most_common_words, how = 'outer' ,indicator=True).loc[lambda x : x['_merge']=='left_only']
positive_unique = positive_unique.drop(["_merge"], axis = 1)
positive_unique.index = range(len(positive_unique.index))

negative_unique =negative_counts.merge(most_common_words, how = 'outer' ,indicator=True).loc[lambda x : x['_merge']=='left_only']
negative_unique = negative_unique.drop(["_merge"], axis = 1)
negative_unique.index = range(len(negative_unique.index))

In [68]:
# Display the above
positive_unique.head(15)

Unnamed: 0,unique_values,counts
0,:),3568.0
1,:-),692.0
2,:d,629.0
3,...,289.0
4,happi,211.0
5,hi,173.0
6,:p,138.0
7,<3,134.0
8,..,128.0
9,friday,116.0


In [69]:
# Display the above
negative_unique.head(15)

Unnamed: 0,unique_values,counts
0,:(,4571.0
1,:-(,493.0
2,...,331.0
3,pleas,275.0
4,♛,210.0
5,》,210.0
6,can't,180.0
7,sorri,148.0
8,realli,131.0
9,sad,123.0


In [70]:
# Create dictionaries for word cloud (key is the word, value is the frequency)
pos_dict = {}
for i in range(len(positive_unique.unique_values)):
  pos_dict[positive_unique.iloc[i].unique_values] = positive_unique.iloc[i].counts

neg_dict = {}
for i in range(len(negative_unique.unique_values)):
  neg_dict[negative_unique.iloc[i].unique_values] = negative_unique.iloc[i].counts

In [71]:
# Read in the mask shape and set stopwords
twitter = np.array(Image.open("twitter.png"))
stopwords = set(STOPWORDS)

# Set wordcloud parameters and save to file
# Display disabled for presentation
wc = WordCloud(stopwords=stopwords, font_path = "VeraMono.ttf", background_color="black", mask = twitter, max_words = 200, colormap = "Greens").fit_words(pos_dict)
wc.to_file('pos_cloud.png')

<wordcloud.wordcloud.WordCloud at 0x247b768eb20>

In [72]:
# Might have to run a few times due to randomly generated positioning
# Set wordcloud parameters and save to file
# Display disabled for presentation
wc = WordCloud(max_font_size = 250, font_path = "VeraMono.ttf",stopwords=stopwords, background_color="black", colormap = "Reds", mask = twitter, max_words = 200).fit_words(neg_dict)
wc.to_file('neg_cloud.png')

<wordcloud.wordcloud.WordCloud at 0x247b748dfa0>