In [2]:
import spacy
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 

In [3]:
from google.colab import drive
drive.mount('/content/drive')
url = '/content/drive/My Drive/NLTK/edited_tweets.csv'
data = pd.read_csv(url)

Mounted at /content/drive


In [4]:
df = data[['id','content']]

In [None]:
df.isnull().sum()

id         0
content    0
dtype: int64

In [None]:
  df.dtypes

id          int64
content    object
dtype: object

In [5]:
nlp = spacy.load('en_core_web_sm')

In [5]:
nlp.components

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x7f3e92e40340>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x7f3e92e40280>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x7f3e92e3ec10>),
 ('senter', <spacy.pipeline.senter.SentenceRecognizer at 0x7f3e92e26a00>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x7f3e92b76bc0>),
 ('lemmatizer',
  <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x7f3e92adfac0>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x7f3e92e3e970>)]

In [6]:
nlp.component_names

['tok2vec',
 'tagger',
 'parser',
 'senter',
 'attribute_ruler',
 'lemmatizer',
 'ner']

In [7]:
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

### Using Spacy TextBlob: Sentiment Analysis using TextBlob

In [52]:
%pip install spacytextblob
from spacytextblob.spacytextblob import SpacyTextBlob

In [59]:
nlp.add_pipe('spacytextblob')

<spacytextblob.spacytextblob.SpacyTextBlob at 0x7ff3bb773fd0>

In [60]:
nlp.pipe_names

['tok2vec',
 'tagger',
 'parser',
 'attribute_ruler',
 'lemmatizer',
 'ner',
 'spacytextblob']

In [55]:
import spacy
nlp = spacy.load('en_core_web_sm')

## Working with SpaCy
- Removing Stopwords
- Lemmatizing
- Stemmeing

In [61]:
tweets = list(data['content'])

#### Stop words 

In [32]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
def remove_stop(text):
  stop_words = set(stopwords.words('english'))
  word_tokens = word_tokenize(text.lower())

  res = ""
  for word in word_tokens:
    if word not in stop_words:
      res += f"{word} "
    
  return res

res = []
for tweet in tweets:
  res.append(remove_stop(tweet))

token = pd.Series(res)
df['tokenize'] = token.values

#### Lemmantization with spacy

In [38]:
res = []
for tweet in tweets:
  text = tweet
  tokens = nlp(text)
  res_str = ""
  for token in tokens:
    res_str += f" {token.lemma_}"
  res.append(res_str)

df['lemmantize'] = (pd.Series(res)).values

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['lemmantize'] = (pd.Series(res)).values


In [None]:
from nltk.stem import WordNetLemmatizer 
from nltk.stem import PorterStemmer

res = []
ps = PorterStemmer()

for tweet in tweets:
  test = tweet 
  tokenization = nltk.word_tokenize(text)
  res_str = ""
  for w in tokenization:
    rootWord = ps.stem(w)
    res_str += f" {rootWord}"
  res.append(res_str)

df['Stemmer'] = (pd.Series(res)).values

In [76]:
docx = nlp(tweets[0])
print(docx._.polarity)

0.0


In [82]:
lemman = list(df['lemmantize'])


[' GPT4   GPT4   airdrop       HEX AI   AI   AI',
 ' ne par l   intelligence   open ai tu te',
 ' glad to see   GPT4 be 10x now   and still    airdrop      BLUR    DOGE   GPT4',
 ' wow   just watch the late South Park episode and my be still all the hilarious and clever commentary   can   t believe I    get to write this post about it too',
 ' struggle to find the perfect   try our prompt generator tool   for endless inspiration   give it a go   marketing',
 ' gpt4   10    airdrop     AI    PAW   AI   DOGE   APT',
 ' an interesting read on in journalism by',
 ' be ready to enter the world of AI     GPT4   airdrop     AI AI GPT4   BONK     GPT4   DOGE   GPT4',
 ' I en mi    la no usar   el de',
 ' opinion   Government should go slow when AI ai comment gift article technology',
 ' plus not work again   it   s down like every weekday',
 ' sorry    t go anywhere   the good news be that   t necessarily mean the end of learn   some be find way to integrate AI into the classroom   tech',
 ' 9

In [86]:
df['lemmantize']=df['lemmantize'].astype('str').replace({'\d+': np.nan, 'nan': np.nan}, regex=True).astype('object')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['lemmantize']=df['lemmantize'].astype('str').replace({'\d+': np.nan, 'nan': np.nan}, regex=True).astype('object')


In [87]:

df['lemmantize'].sample(29)

59907                                                  NaN
7335                                    US     la d   lien
59808                                                  NaN
66456                                                  NaN
65295                                                  NaN
52225                                                  NaN
44007     how be you with in order to save a little tim...
9203           any idea by when it be available in Bard AI
4744      to be a guest on the Convergence    Thrilling...
43333     De impact van   Lees de van de expert   En   ...
32249                                                  NaN
16761     text datum clean with   my parser be text wit...
7820      if you ask about a feature not   so   popular...
52250                                                  NaN
17452                         lady and be make history now
33245                              never ask about regular
19617                                                  N

In [84]:
polarity =[]
for tweet in lemman:
  text = nlp(tweet)
  if isinstance(text,str):
    polar = text.sentiment.polarity
  else:
    polar = "Nan"  
    
  polarity.append(polar)




In [85]:
print(polarity)

['Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan'

In [80]:
from textblob.en import subjectivity
def get_sentiment(tweet):
  text = nlp(tweet)
  if isinstance(text,str):
    polarity = text.sentiment.polarity
    # subjectivity = text.sentiment.subjectivity  
  else:
    polarity = "Nan"
    # subjectivity = "Nan"
  # result = {'Polarity':polarity,
            # 'Subjectivity':subjectivity}

# ??nlp.pipe_names
df['sentiment'] = df['lemmantize'].apply(get_sentiment) 

KeyboardInterrupt: ignored

In [75]:

df['sentiment'].unique

<bound method Series.unique of 0        None
1        None
2        None
3        None
4        None
         ... 
77330    None
77331    None
77332    None
77333    None
77334    None
Name: sentiment, Length: 77335, dtype: object>

In [29]:
import nltk 
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
def ge_sentiment(text):
    blob = TextBlob(text).sentiment
    sentiment_polarity = blob.sentiment.polarity 
    sentiment_subjectivity = blob.sentiment.subjectivity 
    if sentiment_polarity = 0:
        sentiment_label = 'Positive'
    elit sentiment_polarity < 0:
        sentiment_label = 'Negative'
    else 
        sentiment_label = 'Neutral'

IndentationError: expected an indented block after function definition on line 1 (4036910598.py, line 1)