<center>
    
### Data Mining Project 

### Project: US Presidential Impact on Afghanistan
### Team Members: Aditya Taori and Yuthika Shekhar
</center>

### Importing Libraries

In [None]:
##Importing Libraries
import re
import time
import string
from nltk.corpus import stopwords
import pandas as pd
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
%matplotlib inline
import nltk
from nltk.util import ngrams
nltk.download("stopwords")
nltk.download('punkt')
nltk.download('wordnet')
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer,PorterStemmer
import plotly.express as px
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from matplotlib.cbook import flatten

### Defining Helper functions

### Creating Wordcloud

In [None]:
def create_wordcloud(text_col,figure_size = (15.0,10.0),title=None,title_size = 30):
    """Build a word cloud from a text column, display it, and return it.

    Parameters
    ----------
    text_col : pandas.Series
        Column of documents. Missing values are skipped and every remaining
        entry is converted to ``str`` before being joined with single spaces.
    figure_size : tuple of float, default (15.0, 10.0)
        Size passed to ``plt.figure(figsize=...)``.
    title : str, optional
        Title drawn above the displayed image. Nothing is drawn when it is
        ``None`` or empty. (Previously this argument was accepted but ignored.)
    title_size : int, default 30
        Font size used for ``title``.

    Returns
    -------
    WordCloud
        The generated cloud, so the caller can persist it with ``to_file``.
        The title only affects the displayed figure, not the returned object.
    """
    # dropna/astype guard against NaN or non-string cells breaking str.join.
    text = " ".join(text_col.dropna().astype(str).tolist())
    wordcloud = WordCloud(
        background_color="white",
        random_state=42,  # fixed seed keeps the layout reproducible
        width=500,
        height=500,
    ).generate(text)

    plt.figure(figsize=figure_size)
    plt.imshow(wordcloud, interpolation='bilinear')
    if title:
        plt.title(title, fontsize=title_size)
    plt.axis("off")
    plt.show()
    return wordcloud

### Creating Ngrams Dataframe

In [None]:
def ngrams_frequency_chart(df,text_col,ngram_val,top_n=30):
    """Return the most frequent n-grams of a text column as a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the documents.
    text_col : str
        Name of the column in ``df`` that contains the text.
    ngram_val : int
        Size of the n-grams (1 = unigrams, 2 = bigrams, ...).
    top_n : int, default 30
        Number of most common n-grams to keep.

    Returns
    -------
    pandas.DataFrame
        Columns ``Ngrams`` (tuple of tokens) and ``Count``, ordered from most
        to least frequent. An empty frame with the same two columns is
        returned when no n-gram can be built (previously this raised a
        length-mismatch error while assigning the column names).

    Notes
    -----
    All rows are joined into one string before tokenising, so n-grams with
    ``ngram_val > 1`` can span the boundary between two consecutive rows.
    """
    # dropna/astype guard against NaN or non-string cells breaking str.join.
    corpus = " ".join(df[text_col].dropna().astype(str).tolist())
    tokens = nltk.word_tokenize(corpus)
    ngram_count = Counter(ngrams(tokens, ngram_val))
    # Passing the column names up front keeps the schema stable for empty input.
    return pd.DataFrame(ngram_count.most_common(top_n), columns=['Ngrams','Count'])

### Cleaning the Text

In [None]:
def _english_stopwords():
    """Return the NLTK English stop-word list as a frozenset, built only once."""
    cached = getattr(_english_stopwords, "_cache", None)
    if cached is None:
        cached = frozenset(stopwords.words('english'))
        _english_stopwords._cache = cached
    return cached


def data_cleaning(text):
    """Normalise one piece of text for n-gram and word-cloud analysis.

    Steps, in order: lowercase, drop ``[...]`` spans, drop URLs, drop
    ``<...>`` tags, strip ASCII punctuation, turn newlines into spaces, drop
    every token that contains a digit, tokenise, and remove English stop words.

    Parameters
    ----------
    text : object
        Raw text; it is converted with ``str()`` first, so non-string input
        (including NaN) is cleaned as its string representation.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    text = str(text).lower()
    text = re.sub(r'\[.*?\]', '', text)                 # square-bracketed spans
    text = re.sub(r'https?://\S+|www\.\S+', '', text)   # hyperlinks
    text = re.sub(r'<.*?>+', '', text)                  # HTML tags
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Replace newlines with a space: deleting them glued the last word of one
    # line to the first word of the next.
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\w*\d\w*', '', text)                # tokens containing a digit
    text_tokens = word_tokenize(text)
    # Set membership instead of rebuilding the stop-word list for every token.
    stop_words = _english_stopwords()
    return " ".join(word for word in text_tokens if word not in stop_words)

### Setting Up Input Directories

In [None]:
import os  # imported here so this configuration cell runs on its own

# Event analysed by this notebook; also used as the prefix of every output file.
event_name = "Osama Bin Laden Killed"

# Root folder for all generated files. The default is the original author's
# local path; set the US_IMPACT_OUTPUT_DIR environment variable to run the
# notebook on another machine without editing this cell.
_DEFAULT_OUTPUT_DIR = "E:/UOR Notes/Data Mining/Assignments/US Presidential Impact/Output_Files/"
# Later cells build paths by plain string concatenation, so the value must end
# with exactly one "/".
output_dir = (os.environ.get("US_IMPACT_OUTPUT_DIR") or _DEFAULT_OUTPUT_DIR).rstrip("/\\") + "/"
visualization_folder = "Visualization Results/"

# Input CSVs live under <output_dir>/CSV Files/<event_name>/.
input_dir = output_dir + "CSV Files/" + event_name + "/"
tweets_file = "Osama bin Laden Killed.csv"  #Tweets Dataset
users_file = "User_Info_Osama bin Laden Killed.csv"  #Users Dataset
tweet_file_path = input_dir + tweets_file
user_file_path = input_dir + users_file

### Reading Tweets and User Dataset 

In [None]:
# Load the tweets dataset first, then the users dataset, with default CSV parsing.
tweets_data, users_data = (
    pd.read_csv(csv_path) for csv_path in (tweet_file_path, user_file_path)
)

In [None]:
# Preview the first five tweets to check that the file was parsed as expected.
tweets_data.head(n=5)

### Creating datetime object for Tweets Published Date

In [None]:
# Turn "YYYY-MM-DDTHH:MM:SS.000Z" strings into "YYYY-MM-DD HH:MM:SS".
# regex=False makes both replacements literal on every pandas version; with the
# old regex default the "." in ".000Z" acted as a wildcard.
time_val = (
    tweets_data["created_at"]
    .str.replace(".000Z", "", regex=False)
    .str.replace("T", " ", regex=False)
)
tweets_data["Published_Date_Format"] = pd.to_datetime(time_val, format="%Y-%m-%d %H:%M:%S")
# Calendar date only, used to group tweets per day.
tweets_data["Published_Date"] = tweets_data["Published_Date_Format"].dt.date

### Plotting Timeline Distribution for Tweets

In [None]:
fig, ax = plt.subplots(figsize=(15, 8))
# Number of tweets per day (non-null retweet_count values are counted).
timeline_sentiment_distribution = (
    tweets_data
    .groupby(["Published_Date"])["public_metrics.retweet_count"]
    .count()
    .plot(ax=ax)
)
ax.set_title("Tweets Timeline Distribution")
ax.set_xlabel("Tweets Published Date")
ax.set_ylabel("Count of Tweets")
outfile_name = f"{output_dir}{visualization_folder}{event_name}_Timeline_Distribution.png"
fig.savefig(outfile_name)

### Creating Wordcloud of Original Tweets 

In [None]:
# Word cloud over the raw tweet text, shown inline and then written to disk.
outfile_name = f"{output_dir}{visualization_folder}{event_name}_Wordcloud.png"
wc = create_wordcloud(tweets_data["text"], title=f"{event_name}_Wordcloud")
wc.to_file(outfile_name)

### Creating Ngrams Dataframe 

In [None]:
# Most frequent unigrams, bigrams and trigrams of the raw tweet text.
uni_df, bigram_df, trigram_df = (
    ngrams_frequency_chart(tweets_data, "text", ngram_val=size) for size in (1, 2, 3)
)

### Plotting Unigram Frequency

In [None]:
# Create the axes explicitly and hand it to pandas: without `ax`, DataFrame.plot
# opens its own figure, which left the 15x15 figure empty and saved a
# default-size chart instead.
fig, ax = plt.subplots(figsize=(15, 15))
uni_df.plot.bar(x="Ngrams", y="Count", title="Unigram Frequency", ax=ax)
ax.set_xlabel("Unigrams")
ax.set_ylabel("Count")
outfile_name = output_dir + visualization_folder + event_name + "_Unigram_Frequency.png"
# bbox_inches="tight" keeps the rotated tick labels inside the saved image.
fig.savefig(outfile_name, bbox_inches="tight")
plt.show()

### Plotting Bigram Frequency Bar Chart  

In [None]:
# Create the axes explicitly and hand it to pandas: without `ax`, DataFrame.plot
# opens its own figure, which left the 15x15 figure empty and saved a
# default-size chart instead.
fig, ax = plt.subplots(figsize=(15, 15))
bigram_df.plot.bar(x="Ngrams", y="Count", title="Bigram Frequency", ax=ax)
ax.set_xlabel("Bigrams")
ax.set_ylabel("Count")

outfile_name = output_dir + visualization_folder + event_name + "_Bigram_Frequency.png"
# bbox_inches="tight" keeps the rotated tick labels inside the saved image.
fig.savefig(outfile_name, bbox_inches="tight")
plt.show()

### Plotting Trigram Frequency Bar Chart  

In [None]:
# Create the axes explicitly and hand it to pandas: without `ax`, DataFrame.plot
# opens its own figure, which left the 15x15 figure empty and saved a
# default-size chart instead.
fig, ax = plt.subplots(figsize=(15, 15))
trigram_df.plot.bar(x="Ngrams", y="Count", title="Trigram Frequency", ax=ax)
ax.set_xlabel("Trigrams")
ax.set_ylabel("Count")

outfile_name = output_dir + visualization_folder + event_name + "_Trigram_Frequency.png"
# bbox_inches="tight" keeps the rotated tick labels inside the saved image.
fig.savefig(outfile_name, bbox_inches="tight")
plt.show()

### Fetching Hashtags and Usertags from tweets

In [None]:
# Per tweet, collect every @mention and #hashtag (letters, digits, underscore).
tweets_data["user_tags"] = tweets_data["text"].str.findall('@[a-zA-Z0-9_]+')
tweets_data["hashtags"] = tweets_data["text"].str.findall('#[a-zA-Z0-9_]+')

hashtags = tweets_data["hashtags"].tolist()
# Count each hashtag across all tweets. Rows where findall produced no list
# (e.g. missing text) are skipped instead of being counted as a NaN "hashtag".
hashtag_counts = Counter(
    tag for found in hashtags if isinstance(found, list) for tag in found
)
# Explicit column names keep the frame usable even when no hashtag exists
# (the previous construction raised KeyError on "Hashtags_count" in that case).
hashtags_df = pd.DataFrame(
    list(hashtag_counts.items()), columns=["Hashtags", "Hashtags_count"]
)
sorted_hc = hashtags_df.sort_values(["Hashtags_count"], ascending=False)

# reset_index() keeps the pre-sort position in an "index" column, as before.
sorted_hc = sorted_hc.reset_index()
sorted_hc


In [None]:
tags = tweets_data["user_tags"].tolist()
# Count each @mention across all tweets; rows without a list of matches are skipped.
tag_counts = Counter(
    tag for found in tags if isinstance(found, list) for tag in found
)
# Frame indexed by tag with a single "UserTags_count" column; the explicit
# dtype keeps the column numeric (and the cell working) when no tag exists.
user = pd.DataFrame({"UserTags_count": pd.Series(dict(tag_counts), dtype="int64")})
sorted_tags = user.sort_values(["UserTags_count"], ascending=False)
sorted_tags = sorted_tags.reset_index()
sorted_tags.columns = ["Tags", "Count"]
# Share of all mentions held by each tag. Sum only the Count column: summing
# the whole frame also concatenated every tag string just to read one number.
# NOTE(review): the column name "Tags_Proporton" is misspelled but kept, since
# other code may already read it.
sorted_tags["Tags_Proporton"] = sorted_tags["Count"] / sorted_tags["Count"].sum()
sorted_tags

In [None]:
# Total number of @mentions. Sum only the Count column rather than the whole
# frame, which also summed the string and proportion columns.
total_tags = sorted_tags["Count"].sum()
total_tags

### Plotting Top 20 Hashtags from tweets

In [None]:
# Bar chart of the 20 most used hashtags (sorted_hc is already in descending order).
ax = sorted_hc.head(20).plot.bar(
    x="Hashtags",
    y="Hashtags_count",
    title="Top 20 Hashtags",
    xlabel=" Hashtags",
    ylabel="Hashtags Count",
    color="Orange",
    rot=90,
    figsize=(10, 5),
)
fig = ax.get_figure()
outfile_name = f"{output_dir}{visualization_folder}{event_name}_Top_20_Hashtags.png"
fig.savefig(outfile_name)

### Plotting Top 20 Usertags from tweets

In [None]:
# Bar chart of the 20 most mentioned users (sorted_tags is already in descending order).
ax = sorted_tags.head(20).plot.bar(
    x="Tags",
    y="Count",
    title="Top 20 Usertags",
    xlabel="Usertags",
    ylabel="Usertags Count",
    color="Purple",
    rot=90,
    figsize=(10, 5),
)
fig = ax.get_figure()
outfile_name = f"{output_dir}{visualization_folder}{event_name}_Top_20_Usertags.png"
fig.savefig(outfile_name)

### Applying Data Cleaning on Tweets

In [None]:
# Run the text-cleaning helper on every tweet; the result is one cleaned string per row.
cleaned_data = tweets_data["text"].map(data_cleaning)

In [None]:
# Store the cleaned text next to the raw text so later cells can use either column.
tweets_data["cleaned_text"] = cleaned_data

### Wordcloud of Cleaned Data 

In [None]:
# Word cloud over the cleaned tweet text, shown inline and then written to disk.
outfile_name = f"{output_dir}{visualization_folder}{event_name}_Cleaned_Data_Wordcloud.png"
wc = create_wordcloud(tweets_data["cleaned_text"], title=f"{event_name}_Cleaned Data Wordcloud")
wc.to_file(outfile_name)

### Cleaned Data Ngrams dataframe

In [None]:
# Most frequent unigrams, bigrams and trigrams of the cleaned tweet text.
cleaned_uni_df, cleaned_bigram_df, cleaned_trigram_df = (
    ngrams_frequency_chart(tweets_data, "cleaned_text", ngram_val=size) for size in (1, 2, 3)
)

### Cleaned Unigram Frequency Bar Plot

In [None]:
# Create the axes explicitly and hand it to pandas: without `ax`, DataFrame.plot
# opens its own figure, which left the 15x15 figure empty and saved a
# default-size chart instead.
fig, ax = plt.subplots(figsize=(15, 15))
cleaned_uni_df.plot.bar(x="Ngrams", y="Count", title="Unigram Frequency", ax=ax)
ax.set_xlabel("Unigrams")
ax.set_ylabel("Count")
outfile_name = output_dir + visualization_folder + event_name + "_Cleaned_Unigram_Frequency.png"
# Save before plt.show(); bbox_inches="tight" keeps the rotated tick labels in the image.
fig.savefig(outfile_name, bbox_inches="tight")
plt.show()


### Cleaned Bigram Frequency Bar Plot

In [None]:
# Create the axes explicitly and hand it to pandas: without `ax`, DataFrame.plot
# opens its own figure, which left the 15x15 figure empty and saved a
# default-size chart instead.
fig, ax = plt.subplots(figsize=(15, 15))
cleaned_bigram_df.plot.bar(x="Ngrams", y="Count", title="Bigram Frequency", ax=ax)
ax.set_xlabel("Bigrams")
ax.set_ylabel("Count")
outfile_name = output_dir + visualization_folder + event_name + "_Cleaned_Bigram_Frequency.png"
# Save before plt.show(); bbox_inches="tight" keeps the rotated tick labels in the image.
fig.savefig(outfile_name, bbox_inches="tight")
plt.show()


### Cleaned Trigram Frequency Bar Plot

In [None]:
# Create the axes explicitly and hand it to pandas: without `ax`, DataFrame.plot
# opens its own figure, which left the 15x15 figure empty and saved a
# default-size chart instead.
fig, ax = plt.subplots(figsize=(15, 15))
cleaned_trigram_df.plot.bar(x="Ngrams", y="Count", title="Trigram Frequency", ax=ax)
ax.set_xlabel("Trigrams")
ax.set_ylabel("Count")
outfile_name = output_dir + visualization_folder + event_name + "_Cleaned_Trigram_Frequency.png"
# bbox_inches="tight" keeps the rotated tick labels inside the saved image.
fig.savefig(outfile_name, bbox_inches="tight")
plt.show()

### Running Sentiment Analysis on tweets with a threshold of 0.05 

In [None]:
sid = SentimentIntensityAnalyzer()


def _label_sentiment(compound_score, threshold=0.05):
    """Map a VADER compound score to 'pos', 'neg' or 'neutral'.

    Both cut-offs are inclusive (score >= threshold is positive, score <=
    -threshold is negative), following VADER's documented convention. The
    positive side previously used a strict ``>``, so a score of exactly 0.05
    was labelled neutral while -0.05 was labelled negative.
    """
    if compound_score >= threshold:
        return 'pos'
    if compound_score <= -threshold:
        return 'neg'
    return 'neutral'


# Full VADER score dict per tweet, computed on the raw (uncleaned) text.
tweets_data['score'] = tweets_data['text'].apply(lambda review: sid.polarity_scores(review))
# Keep only the overall compound score.
tweets_data['compound'] = tweets_data['score'].apply(lambda scores_values: scores_values['compound'])
tweets_data['Sentiments'] = tweets_data['compound'].apply(_label_sentiment)
# `compound` is kept for compatibility; note that it holds the sentiment labels,
# not the numeric scores.
compound = tweets_data['Sentiments']

### Merging Tweets Dataset and Users Dataset 

In [None]:
# List the tweet columns available before merging with the users dataset.
tweets_data.columns

In [None]:
# Distinct author ids present in the users dataset, in first-seen order.
author_id = users_data['author_id_str'].drop_duplicates().tolist()
len(author_id)

# Keep one profile row per author so the left merge cannot multiply tweet rows.
users_data = users_data.drop_duplicates(subset=['author_id_str'])
users_data.shape

# Attach user attributes to every tweet; tweets without a matching user are kept.
# NOTE(review): the tweets' `id_str` is matched against the users'
# `author_id_str` - confirm that `id_str` really holds the author id here.
join = tweets_data.merge(
    users_data,
    how='left',
    left_on='id_str',
    right_on='author_id_str',
)

join.shape

### Getting the sentiment counts

In [None]:
# Number of tweets carrying each sentiment label (non-null compound scores).
sentiment_count = join.groupby("Sentiments")["compound"].agg("count")
sentiment_count

In [None]:
# Persist the merged tweets with their sentiment labels.
# NOTE(review): unlike the figures, this file goes to the current working
# directory and has no "_" between the event name and the suffix - confirm
# whether that is intended.
join.to_csv(f"{event_name}Sentiment_Analysis_Output.csv")

### Getting the sentiment Proportion

In [None]:
# Total number of labelled tweets.
add = sentiment_count.sum()

# Share of each sentiment label, as a percentage of that total.
perc = sentiment_count.div(add).mul(100)
perc



### Sentiment Distribution Bar Plot

In [None]:
# Bar chart of the percentage of tweets per sentiment label.
outfile_name = f"{output_dir}{visualization_folder}{event_name}_Sentiment_Distribution.png"
sentiment_ax = sns.barplot(x=perc.index, y=perc.values.tolist())
sentiment_ax.set_xlabel("Sentiments")
sentiment_ax.set_ylabel("Percentage")
sentiment_ax.set_title(f"{event_name}_Sentiment Distribution")
sentiment_ax.figure.savefig(outfile_name)
plt.show()

### Ngrams Dataframe for Positive and Negative Sentiments

In [None]:
# Split the merged tweets by sentiment label.
pos_df = join.loc[join["Sentiments"] == "pos"]
neg_df = join.loc[join["Sentiments"] == "neg"]

# Top unigrams, bigrams and trigrams of the cleaned text, per sentiment.
pos_uni_df, pos_bigram_df, pos_trigram_df = (
    ngrams_frequency_chart(pos_df, "cleaned_text", ngram_val=size) for size in (1, 2, 3)
)
neg_uni_df, neg_bigram_df, neg_trigram_df = (
    ngrams_frequency_chart(neg_df, "cleaned_text", ngram_val=size) for size in (1, 2, 3)
)

### Positive Sentiments Unigram Frequency

In [None]:
# Create the axes explicitly and hand it to pandas: without `ax`, DataFrame.plot
# opens its own figure, which left the 15x15 figure empty and saved a
# default-size chart instead.
fig, ax = plt.subplots(figsize=(15, 15))
pos_uni_df.plot.bar(x="Ngrams", y="Count", title="Unigram Frequency", ax=ax)
ax.set_xlabel("Unigram")
ax.set_ylabel("Count")

outfile_name = output_dir + visualization_folder + event_name + "_Positive_Sentiment_Unigram_Frequency.png"
# bbox_inches="tight" keeps the rotated tick labels inside the saved image.
fig.savefig(outfile_name, bbox_inches="tight")
plt.show()

### Positive Sentiments Bigram Frequency

In [None]:
# Create the axes explicitly and hand it to pandas: without `ax`, DataFrame.plot
# opens its own figure, which left the 15x15 figure empty and saved a
# default-size chart instead.
fig, ax = plt.subplots(figsize=(15, 15))
pos_bigram_df.plot.bar(x="Ngrams", y="Count", title="Bigram Frequency", ax=ax)
ax.set_xlabel("Bigram")
ax.set_ylabel("Count")

outfile_name = output_dir + visualization_folder + event_name + "_Positive_Sentiment_Bigram_Frequency.png"
# bbox_inches="tight" keeps the rotated tick labels inside the saved image.
fig.savefig(outfile_name, bbox_inches="tight")
plt.show()

### Positive Sentiments Trigram Frequency

In [None]:
# Create the axes explicitly and hand it to pandas: without `ax`, DataFrame.plot
# opens its own figure, which left the 15x15 figure empty and saved a
# default-size chart instead.
fig, ax = plt.subplots(figsize=(15, 15))
pos_trigram_df.plot.bar(x="Ngrams", y="Count", title="Trigram Frequency", ax=ax)
ax.set_xlabel("Trigram")
ax.set_ylabel("Count")

outfile_name = output_dir + visualization_folder + event_name + "_Positive_Sentiment_Trigram_Frequency.png"
# bbox_inches="tight" keeps the rotated tick labels inside the saved image.
fig.savefig(outfile_name, bbox_inches="tight")
plt.show()

### Negative Sentiments Unigram Frequency

In [None]:
# Create the axes explicitly and hand it to pandas: without `ax`, DataFrame.plot
# opens its own figure, which left the 15x15 figure empty and saved a
# default-size chart instead.
fig, ax = plt.subplots(figsize=(15, 15))
neg_uni_df.plot.bar(x="Ngrams", y="Count", title="Unigram Frequency", ax=ax)
ax.set_xlabel("Unigram")
ax.set_ylabel("Count")

outfile_name = output_dir + visualization_folder + event_name + "_Negative_Sentiment_Unigram_Frequency.png"
# bbox_inches="tight" keeps the rotated tick labels inside the saved image.
fig.savefig(outfile_name, bbox_inches="tight")
plt.show()

### Negative Sentiments Bigram Frequency

In [None]:
# Create the axes explicitly and hand it to pandas: without `ax`, DataFrame.plot
# opens its own figure, which left the 15x15 figure empty and saved a
# default-size chart instead.
fig, ax = plt.subplots(figsize=(15, 15))
neg_bigram_df.plot.bar(x="Ngrams", y="Count", title="Bigram Frequency", ax=ax)
ax.set_xlabel("Bigram")
ax.set_ylabel("Count")

outfile_name = output_dir + visualization_folder + event_name + "_Negative_Sentiment_Bigram_Frequency.png"
# bbox_inches="tight" keeps the rotated tick labels inside the saved image.
fig.savefig(outfile_name, bbox_inches="tight")
plt.show()

### Negative Sentiment Trigram Frequency

In [None]:
# Create the axes explicitly and hand it to pandas: without `ax`, DataFrame.plot
# opens its own figure, which left the 15x15 figure empty and saved a
# default-size chart instead.
fig, ax = plt.subplots(figsize=(15, 15))
neg_trigram_df.plot.bar(x="Ngrams", y="Count", title="Trigram Frequency", ax=ax)
ax.set_xlabel("Trigram")
ax.set_ylabel("Count")

outfile_name = output_dir + visualization_folder + event_name + "_Negative_Sentiment_Trigram_Frequency.png"
# bbox_inches="tight" keeps the rotated tick labels inside the saved image.
fig.savefig(outfile_name, bbox_inches="tight")
plt.show()

### Positive Sentiments Wordcloud

In [None]:
# Word cloud over the cleaned text of positive tweets, shown inline and then saved.
outfile_name = f"{output_dir}{visualization_folder}{event_name}_Positive_Sentiments_Wordcloud.png"
pos_wc = create_wordcloud(pos_df["cleaned_text"], title=f"{event_name}_Positive_Sentiments_Wordcloud")
pos_wc.to_file(outfile_name)

### Negative Sentiments Wordcloud

In [None]:
# Word cloud over the cleaned text of negative tweets, shown inline and then saved.
outfile_name = f"{output_dir}{visualization_folder}{event_name}_Negative_Sentiments_Wordcloud.png"
neg_wc = create_wordcloud(neg_df["cleaned_text"], title=f"{event_name}_Negative_Sentiments_Wordcloud")
neg_wc.to_file(outfile_name)

In [None]:
# After the merge the tweets' timestamp column is `created_at_x`.
# Turn "YYYY-MM-DDTHH:MM:SS.000Z" strings into "YYYY-MM-DD HH:MM:SS".
# regex=False makes both replacements literal on every pandas version; with the
# old regex default the "." in ".000Z" acted as a wildcard.
time_val = (
    join["created_at_x"]
    .str.replace(".000Z", "", regex=False)
    .str.replace("T", " ", regex=False)
)
join["Published_Date_Format"] = pd.to_datetime(time_val, format="%Y-%m-%d %H:%M:%S")
# Calendar date only, used to group tweets per day.
join["Published_Date"] = join["Published_Date_Format"].dt.date

### Sentiments Timeline Distribution 

In [None]:
fig, ax = plt.subplots(figsize=(15, 8))
# Tweets per day and sentiment label; unstack() makes one line per label.
timeline_sentiment_distribution = (
    join
    .groupby(["Published_Date", "Sentiments"])["Sentiments"]
    .count()
    .unstack()
    .plot(ax=ax)
)
ax.set_xlabel("Published Date")
ax.set_ylabel("Count")
ax.set_title("Timeline Distribution of Sentiment")
outfile_name = f"{output_dir}{visualization_folder}{event_name}_Sentiments_Timeline_Distribution.png"
fig.savefig(outfile_name)