# first dataset Elon_musk tweets

### ONE:
1) Perform sentiment analysis on the Elon Musk tweets

In [1]:
# Install Libraries if not installed
#%pip install spacy
#!python -m spacy download en_core_web_sm
#!pip install wordcloud


In [2]:
# Import libraries
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import string
import spacy 
import nltk
from matplotlib.pyplot import imread
from wordcloud import WordCloud, STOPWORDS


TypeError: dataclass_transform() got an unexpected keyword argument 'field_specifiers'

In [None]:
# Read the tweets CSV (Latin-1 encoded) and discard the exported index column
Elon = pd.read_csv("Elon_musk.csv", encoding='Latin-1')
Elon = Elon.drop(columns=['Unnamed: 0'])
Elon

### Text Preprocessing

In [None]:
# Turn the 'Text' column into a list of stripped, non-empty tweet strings.
# `Elon` is rebound from DataFrame to list here (later cells expect the list), so
# the isinstance check keeps this cell re-runnable: on a second run `Elon` is
# already the list and is simply cleaned again.
raw_tweets = Elon['Text'].dropna().astype(str) if isinstance(Elon, pd.DataFrame) else Elon
# dropna(): empty CSV cells are read as NaN (a float), which has no .strip()
Elon = [tweet.strip() for tweet in raw_tweets]  # remove leading and trailing whitespace
Elon = [tweet for tweet in Elon if tweet]  # drop empty strings (falsy in Python)
Elon[0:10]

In [None]:
# Concatenate all tweets into a single space-separated string so the
# following cleaning steps can operate on the corpus as one text.
Elon_text=' '.join(Elon)
Elon_text

In [None]:
# Remove Twitter @username handles from the combined text.
# TweetTokenizer is created with strip_handles=True, so handles are left out
# of the token list it returns.
from nltk.tokenize import TweetTokenizer
tknzr = TweetTokenizer(strip_handles=True)
Elon_tokens=tknzr.tokenize(Elon_text)
print(Elon_tokens)

In [None]:
# Join the handle-free tokens back into one space-separated string
Elon_tokens_text=' '.join(Elon_tokens)
Elon_tokens_text

In [None]:
# Delete every character listed in string.punctuation from the text
_punct_table = str.maketrans('', '', string.punctuation)
Punctuations = Elon_tokens_text.translate(_punct_table)
Punctuations

In [None]:
# Drop URLs: any run of non-whitespace characters that starts with "http"
import re
url = re.compile(r'http\S+').sub('', Punctuations)
url

In [None]:
# Tokenization: split the cleaned text into word tokens.
from nltk.tokenize import word_tokenize
nltk.download('punkt')
# Recent NLTK releases load the Punkt model from the 'punkt_tab' resource instead of
# 'punkt'; fetch both so word_tokenize works on old and new versions alike.
nltk.download('punkt_tab')
text_tokens=word_tokenize(url)
print(text_tokens)

In [None]:
# Number of word tokens produced by the tokenization step
len(text_tokens)

In [None]:
# Remove stop words: NLTK's English list plus a few corpus-specific noise tokens.
from nltk.corpus import stopwords
nltk.download('stopwords')
my_stop_words=stopwords.words('english')

sw_list = ['\x92','rt','ye','yeah','haha','Yes','U0001F923','I']
my_stop_words.extend(sw_list)

# The tokens are not lower-cased yet at this point, while NLTK's list is all
# lower-case, so compare case-insensitively; otherwise "The", "This", "It", ...
# slip through. A set also makes each membership test O(1).
stop_word_set = {word.lower() for word in my_stop_words}
no_stop_tokens=[word for word in text_tokens if word.lower() not in stop_word_set]
print(no_stop_tokens)

In [None]:
# Normalise case: lower-case every remaining token, then preview a slice
lower_words = list(map(str.lower, no_stop_tokens))
print(lower_words[100:200])

In [None]:
# Stemming (optional): reduce each lower-cased token with the Porter stemmer
from nltk.stem import PorterStemmer
ps = PorterStemmer()
stemmed_tokens = list(map(ps.stem, lower_words))
print(stemmed_tokens[100:200])

In [None]:
# Lemmatization
# Needs the model loaded below; install with: python -m spacy download en_core_web_sm
# The lower-cased tokens are re-joined into one string and parsed as a single spaCy Doc.
nlp=spacy.load('en_core_web_sm')
doc=nlp(' '.join(lower_words))
print(doc)

In [None]:
# Collect the lemma of every token in the parsed Doc (one list entry per token)
lemmas=[token.lemma_ for token in doc]
print(lemmas)

In [None]:
# Re-assemble the lemmas into one cleaned, space-separated text
clean_tweets=' '.join(lemmas)
clean_tweets

### Feature Extraction

#### 1. Using CountVectorizer

In [None]:
# Bag-of-words with default CountVectorizer settings.
# NOTE: `lemmas` is a list of individual lemmas, so each list entry is
# vectorised as its own document (one matrix row per lemma).
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer()
tweetscv=cv.fit_transform(lemmas)

In [None]:
# Learned vocabulary: mapping of term -> column index in the count matrix
print(cv.vocabulary_)

In [None]:
# Preview a slice of the vocabulary terms (in column order).
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement and returns an ndarray, converted to a list for printing.
print(cv.get_feature_names_out()[100:200].tolist())

In [None]:
# Dense view of rows 100-199 of the count matrix
print(tweetscv.toarray()[100:200])

In [None]:
# (number of documents, vocabulary size) of the count matrix
print(tweetscv.toarray().shape)

#### 2. CountVectorizer with N-grams (Bigrams & Trigrams)

In [None]:
# Count uni-, bi- and tri-grams, keeping the 100 most frequent.
cv_ngram_range=CountVectorizer(analyzer='word',ngram_range=(1,3),max_features=100)
# Fit on the cleaned text as a document rather than on `lemmas`: `lemmas` is a
# list of single tokens, and a one-word document can never yield a bigram or
# trigram, so the n-gram range had no effect.
bow_matrix_ngram=cv_ngram_range.fit_transform([clean_tweets])

In [None]:
# Show the selected n-gram features and their counts.
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
print(cv_ngram_range.get_feature_names_out().tolist())
print(bow_matrix_ngram.toarray())

#### 3. TF-IDF Vectorizer

In [None]:
import re
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF needs real documents: inverse document frequency is computed across
# documents, and n-grams need more than one word per document. Fitting on
# `lemmas` (one token per "document") gave neither, so use one document per
# tweet instead, with URLs and @handles blanked out and English stop words
# removed by the vectorizer (which also lower-cases by default).
tweet_docs=[re.sub(r'http\S+|@\w+',' ',tweet) for tweet in Elon]
tfidfv_ngram_max_features=TfidfVectorizer(norm='l2',analyzer='word',ngram_range=(1,3),
                                          max_features=500,stop_words='english')
tfidf_matix_ngram=tfidfv_ngram_max_features.fit_transform(tweet_docs)

In [None]:
# Show the selected TF-IDF features and the weight matrix.
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
print(tfidfv_ngram_max_features.get_feature_names_out().tolist())
print(tfidf_matix_ngram.toarray())

#### 4. Generate Word Cloud

In [None]:
# Helper for rendering a generated word cloud
def plot_cloud(wordcloud):
    """Draw `wordcloud` as an image on a new 40x30 inch figure with the axes hidden."""
    _, ax = plt.subplots(figsize=(40,30))
    ax.imshow(wordcloud)
    ax.axis('off')
    
# Generate Word Cloud

# Build a local stop-word set instead of calling STOPWORDS.add(): STOPWORDS is a
# module-level set owned by the wordcloud package, and mutating it leaks these
# corpus-specific words into every later WordCloud created in the same kernel.
cloud_stopwords=STOPWORDS | {'pron','rt','yeah'}
wordcloud=WordCloud(width=3000,height=2000,background_color='black',max_words=50,
                   colormap='Set1',stopwords=cloud_stopwords).generate(clean_tweets)
plot_cloud(wordcloud)

#### 5. Named Entity Recognition (NER)

In [None]:
# Named-entity visualisation: run the spaCy pipeline over the cleaned text and
# render the detected entities inline (displaCy style='ent'). The resulting
# Doc also carries the part-of-speech tags inspected in the next cells.
nlp=spacy.load('en_core_web_sm')

one_block=clean_tweets
doc_block=nlp(one_block)
spacy.displacy.render(doc_block,style='ent',jupyter=True)

In [None]:
# Print each token in positions 100-199 alongside its coarse part-of-speech tag
for token in doc_block[100:200]:
    print(token,token.pos_)

In [None]:
# Keep only the text of tokens tagged as nouns or verbs
nouns_verbs = [tok.text for tok in doc_block if tok.pos_ in {'NOUN', 'VERB'}]
print(nouns_verbs[100:200])

In [None]:
# Frequency table of the noun and verb tokens
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()

X = cv.fit_transform(nouns_verbs)
sum_words = X.sum(axis=0)  # column totals = occurrences of each term

# (term, total) pairs ordered from most to least frequent
words_freq = sorted(
    ((term, sum_words[0, col]) for term, col in cv.vocabulary_.items()),
    key=lambda pair: pair[1],
    reverse=True,
)

wd_df = pd.DataFrame(words_freq, columns=['word', 'count'])
wd_df[0:10]  # top ten terms

In [None]:
# Bar chart of the ten most frequent nouns and verbs
wd_df.iloc[:10].plot.bar(
    x='word',
    figsize=(12,8),
    title='Top 10 nouns and verbs',
);

#### 6. Emotion Mining - Sentiment Analysis

In [None]:
# Split the original (stripped, non-empty) tweets into sentences: the tweets are
# joined into one string and passed to NLTK's sentence tokenizer.
from nltk import tokenize
sentences=tokenize.sent_tokenize(' '.join(Elon))
sentences

In [None]:
# One row per sentence, held in a single 'sentence' column
sent_df = pd.DataFrame({'sentence': sentences})
sent_df

# second dataset Afinn

In [None]:
# Load the AFINN lexicon file (Latin-1 encoded CSV)
afin = pd.read_csv("Afinn.csv",  encoding='latin-1')
afin

In [None]:
# Build a lookup dict from the lexicon: 'word' column -> 'value' column
affinity_scores=afin.set_index('word')['value'].to_dict()
affinity_scores

In [None]:
# Custom function: score each word of a sentence in lemmatised form and sum the
# scores to get a value for the whole original sentence
nlp=spacy.load('en_core_web_sm')
sentiment_lexicon=affinity_scores

def calculate_sentiment(text:str=None):
    """Return the summed lexicon score of `text`.

    The text is parsed with the module-level spaCy pipeline `nlp`; each token's
    lemma is lower-cased and looked up in `sentiment_lexicon`. Lemmas that are
    not in the lexicon contribute 0.

    Parameters
    ----------
    text : str, optional
        Sentence to score.

    Returns
    -------
    Sum of the per-token scores; 0 when `text` is empty, None or not a string.
    """
    # Guard against None, '' and non-string cells (e.g. NaN), which nlp() cannot parse
    if not isinstance(text, str) or not text:
        return 0
    sent_score=0
    for word in nlp(text):
        # Lower-case the lemma before the lookup: lemmas can keep the original
        # casing (e.g. all-caps words), which would otherwise miss the lexicon
        sent_score+=sentiment_lexicon.get(word.lemma_.lower(),0)
    return sent_score

In [None]:
# Manual sanity check: score a single word
calculate_sentiment(text='great')

In [None]:
# Score every sentence and store the result in a new column
sent_df['sentiment_value'] = sent_df['sentence'].map(calculate_sentiment)
sent_df['sentiment_value']

In [None]:
# Whitespace-separated word count of each sentence
sent_df['word_count'] = sent_df['sentence'].str.split().str.len()
sent_df['word_count']

In [None]:
# Sentences ordered from lowest to highest sentiment score
sent_df.sort_values('sentiment_value')

In [None]:
# Summary statistics of the per-sentence sentiment scores
sent_df['sentiment_value'].describe()

In [None]:
# Sentences with a negative sentiment score.
# Strictly below zero: a score of 0 is neutral (no lexicon hit or balanced words),
# and the matching positive filter uses a strict > 0.
sent_df[sent_df['sentiment_value']<0]

In [None]:
# Sentences with a positive sentiment score (strictly above zero)
sent_df.loc[sent_df['sentiment_value'].gt(0)]

In [None]:
# Add an explicit 0..n-1 'index' column (used as the x-axis of the line plot)
sent_df['index'] = range(len(sent_df))
sent_df

In [None]:
# Distribution of the per-sentence sentiment scores.
# sns.distplot is deprecated (since seaborn 0.11) and slated for removal; a
# density-scaled histplot with a KDE overlay is the documented replacement.
import seaborn as sns
plt.figure(figsize=(15,10))
sns.histplot(sent_df['sentiment_value'],kde=True,stat='density')

In [None]:
# Line plot of sentiment score against sentence position
plt.figure(figsize=(15,10))
sns.lineplot(data=sent_df, x='index', y='sentiment_value')

In [None]:
# Relationship between sentence length and sentiment score
sent_df.plot.scatter(
    x='word_count',
    y='sentiment_value',
    figsize=(8,8),
    title='Sentence sentiment value to sentence word count',
);