In [7]:
import pandas as pd
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Download the VADER lexicon resource from NLTK
nltk.download('vader_lexicon')

# Read at most the first 100,000 rows of the news CSV
news_df = pd.read_csv('news_data1.csv', nrows=100000)  # Change the path if your CSV lives elsewhere

# Assumes the CSV file has a 'content' column holding the article text.
# NOTE(review): `news_data` is not referenced again in this cell -- confirm it is still needed.
news_data = news_df['content'].tolist()

# Initialize the VADER sentiment analyzer that classify_news reads as `sid`
sid = SentimentIntensityAnalyzer()

# Function to classify news sentiment using VADER
def classify_news(news_text):
    """Label text as 'positive', 'negative' or 'neutral' from its VADER compound score.

    Uses the module-level ``sid`` analyzer.

    Args:
        news_text: Text to score. Non-string values (for example the float
            NaN pandas produces for an empty CSV cell) and blank strings are
            treated as carrying no sentiment.

    Returns:
        'positive' when the compound score is >= 0.5, 'negative' when it is
        <= -0.5, and 'neutral' otherwise (including missing/blank input).
    """
    # Missing or blank content has nothing to score; returning early also
    # keeps non-string values away from the analyzer, which expects text.
    if not isinstance(news_text, str) or not news_text.strip():
        return 'neutral'

    # Perform sentiment analysis and keep only the aggregate score
    compound = sid.polarity_scores(news_text)['compound']

    # Determine sentiment label based on compound score
    if compound >= 0.5:
        return 'positive'
    if compound <= -0.5:
        return 'negative'
    return 'neutral'

# Add a new column to the DataFrame holding the sentiment label of each article.
# (A bare `news_df[['content', 'sentiment']]` expression used to follow this line;
# in the middle of a cell its result is discarded, so it has been removed.)
news_df['sentiment'] = news_df['content'].apply(classify_news)

# Write every column, including the new 'sentiment' column, to a separate CSV file
news_df.to_csv('india_sentiment.csv', index=False)  # Change the path to write somewhere else

# Display the first rows of the DataFrame with the sentiment column
print(news_df.head())


                                                link  \
0  https://www.travelandtourworld.com/news/articl...   
1  https://blivenews.com/yatra/kerala-to-cement-s...   
2  https://english.mathrubhumi.com/features/trave...   
3  https://www.indulgexpress.com/travel/2023/Jul/...   
4  https://www.travelandtourworld.com/news/articl...   

                                               title  \
0  Kerala Tourism Minister reveals logos for Inte...   
1  Kerala to Cement Slot in Global Adventure Tour...   
2  Kerala RTC offers trip to celebrate New Year a...   
3                                   Relish the rains   
4  Kerala gears up for four major international a...   

                                             snippet          date  \
0  Today, Tourism Minister Shri P A Mohamed Riyas...  24 hours ago   
1  Kerala to Cement Slot in Global Adventure Tour...   11 Jan 2024   
2  If you are wondering where to spend New Year, ...   11 Dec 2023   
3  Often called the gateway to tourism in Kera

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [3]:
import pandas as pd
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download the NLTK resources this cell uses: the VADER lexicon plus the data
# for the tokenizer, stop-word list and WordNet lemmatizer in preprocess_text
nltk.download('vader_lexicon')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Read at most the first 100,000 rows of the news CSV
news_df = pd.read_csv('news_data1.csv', nrows=100000)  # Change the path if your CSV lives elsewhere

# Initialize the VADER sentiment analyzer that classify_news reads as `sid`
sid = SentimentIntensityAnalyzer()

# Preprocessing function
def preprocess_text(text):
    """Normalize raw article text into a lowercase, lemmatized, stop-word-free string.

    Steps: tokenize, keep only alphanumeric tokens and '!', lowercase them,
    drop English stop words, lemmatize what is left, and join with spaces.

    Args:
        text: Raw text. Anything that is not a string (for example the float
            NaN pandas produces for an empty CSV cell) is treated as empty.

    Returns:
        The cleaned text as one space-separated string; '' when there is
        nothing to clean.
    """
    # word_tokenize only accepts strings, so a single missing value would
    # otherwise abort the whole Series.apply call.
    if not isinstance(text, str):
        return ''

    # Tokenize, dropping punctuation/special characters (except '!') and lowercasing
    tokens = [tok.lower() for tok in word_tokenize(text) if tok.isalnum() or tok == '!']

    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    cleaned_words = []
    for tok in tokens:
        # Filter on the surface form BEFORE lemmatizing: the default noun-mode
        # lemmatizer can turn a stop word into a non-stop-word
        # (e.g. 'was' -> 'wa'), which would then survive the filter below.
        if tok in stop_words:
            continue
        lemma = lemmatizer.lemmatize(tok)
        # Still drop tokens whose lemma is itself a stop word
        if lemma not in stop_words:
            cleaned_words.append(lemma)

    return ' '.join(cleaned_words)

# Function to classify news sentiment using VADER
def classify_news(news_text):
    """Label text as 'positive', 'negative' or 'neutral' from its VADER compound score.

    Uses the module-level ``sid`` analyzer.

    Args:
        news_text: Text to score. Non-string values (for example a float NaN
            standing in for missing content) and blank strings are treated as
            carrying no sentiment.

    Returns:
        'positive' when the compound score is >= 0.5, 'negative' when it is
        <= -0.5, and 'neutral' otherwise (including missing/blank input).
    """
    # Missing or blank content has nothing to score; returning early also
    # keeps non-string values away from the analyzer, which expects text.
    if not isinstance(news_text, str) or not news_text.strip():
        return 'neutral'

    # Perform sentiment analysis and keep only the aggregate score
    compound = sid.polarity_scores(news_text)['compound']

    # Determine sentiment label based on compound score
    if compound >= 0.5:
        return 'positive'
    if compound <= -0.5:
        return 'negative'
    return 'neutral'

# Clean every article body with preprocess_text and store it in a new column
news_df['cleaned_content'] = news_df['content'].apply(preprocess_text)

# Classify the cleaned text with VADER, then rebind news_df to just the
# original content and its label (all other columns are dropped from here on)
news_df['vader_sentiment'] = news_df['cleaned_content'].apply(classify_news)
news_df = news_df[['content', 'vader_sentiment']]

# Write the two remaining columns to a separate CSV file
news_df.to_csv('india_vader.csv', index=False)  # Change the path to write somewhere else

# Display the first rows of the DataFrame with the VADER sentiment column
print(news_df.head())


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


                                             content vader_sentiment
0  Wednesday, March 6, 2024Favorite Today, Touris...        positive
1  Thiruvananthapuram: In an effort to firmly sec...        positive
2  ENGLISH MALAYALAM NEWSPAPER E-Paper More+ 900 ...        positive
3  For nature lovers, there could be nothing more...        positive
4  Thursday, January 11, 2024Favorite Kerala gear...        positive


In [None]:
from google.cloud import language_v1
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import os

# Download the NLTK data for the tokenizer, stop-word list and WordNet
# lemmatizer used by preprocess_text
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Point GOOGLE_APPLICATION_CREDENTIALS at the service-account key file.
# NOTE(review): this assignment overwrites any value already present in the
# environment, and the path below is a placeholder that must be edited first.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "path/to/your/service_account_key.json"  # Replace with your service account key file path

# Initialize the Natural Language API client that classify_news reads as `client`
client = language_v1.LanguageServiceClient()

# Preprocessing function
def preprocess_text(text):
    """Normalize raw article text into a lowercase, lemmatized, stop-word-free string.

    Steps: tokenize, keep only alphanumeric tokens and '!', lowercase them,
    drop English stop words, lemmatize what is left, and join with spaces.

    Args:
        text: Raw text. Anything that is not a string (for example the float
            NaN pandas produces for an empty CSV cell) is treated as empty.

    Returns:
        The cleaned text as one space-separated string; '' when there is
        nothing to clean.
    """
    # word_tokenize only accepts strings, so a single missing value would
    # otherwise abort the whole Series.apply call.
    if not isinstance(text, str):
        return ''

    # Tokenize, dropping punctuation/special characters (except '!') and lowercasing
    tokens = [tok.lower() for tok in word_tokenize(text) if tok.isalnum() or tok == '!']

    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    cleaned_words = []
    for tok in tokens:
        # Filter on the surface form BEFORE lemmatizing: the default noun-mode
        # lemmatizer can turn a stop word into a non-stop-word
        # (e.g. 'was' -> 'wa'), which would then survive the filter below.
        if tok in stop_words:
            continue
        lemma = lemmatizer.lemmatize(tok)
        # Still drop tokens whose lemma is itself a stop word
        if lemma not in stop_words:
            cleaned_words.append(lemma)

    return ' '.join(cleaned_words)

# Function to classify news sentiment using Google Cloud Natural Language API
def classify_news(news_text):
    """Label text as 'positive', 'negative' or 'neutral' via the Natural Language API.

    Sends the text to the module-level ``client`` as a plain-text document
    and thresholds the returned document sentiment score.

    Args:
        news_text: Text to analyze. Non-string values and blank strings are
            treated as carrying no sentiment and are not sent to the API.

    Returns:
        'positive' when the score is > 0.5, 'negative' when it is < -0.5, and
        'neutral' otherwise (including missing/blank input).
    """
    # Nothing to analyze: skip the API request entirely for missing/blank text.
    if not isinstance(news_text, str) or not news_text.strip():
        return 'neutral'

    # Analyze sentiment
    document = {"content": news_text, "type": language_v1.Document.Type.PLAIN_TEXT}
    response = client.analyze_sentiment(request={'document': document})
    score = response.document_sentiment.score

    # Determine sentiment label based on sentiment score
    if score > 0.5:
        return 'positive'
    if score < -0.5:
        return 'negative'
    return 'neutral'

# Read the news data from CSV ('your_file.csv' is a placeholder name)
news_df = pd.read_csv('your_file.csv')  # Replace 'your_file.csv' with the actual file path

# Clean every article body with preprocess_text and store it in a new column
news_df['cleaned_content'] = news_df['content'].apply(preprocess_text)

# Classify the cleaned text; classify_news is invoked once per row, so this
# issues one analyze_sentiment call for every article
news_df['google_sentiment'] = news_df['cleaned_content'].apply(classify_news)

# Keep only the 'title', 'content', and 'google_sentiment' columns
final_df = news_df[['title', 'content', 'google_sentiment']]

# Save the final DataFrame to a CSV file (placeholder output name)
final_df.to_csv('your_final_file_with_google_sentiment.csv', index=False)  # Replace with the desired file path

# Display the first rows of the final DataFrame
print(final_df.head())


In [4]:
import pandas as pd
from textblob import TextBlob
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download the NLTK data for the tokenizer, stop-word list and WordNet
# lemmatizer used by preprocess_text
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Read at most the first 100,000 rows of the news CSV
news_df = pd.read_csv('news_data1.csv', nrows=100000)  # Change the path if your CSV lives elsewhere

# Preprocessing function
def preprocess_text(text):
    """Normalize raw article text into a lowercase, lemmatized, stop-word-free string.

    Steps: tokenize, keep only alphanumeric tokens and '!', lowercase them,
    drop English stop words, lemmatize what is left, and join with spaces.

    Args:
        text: Raw text. Anything that is not a string (for example the float
            NaN pandas produces for an empty CSV cell) is treated as empty.

    Returns:
        The cleaned text as one space-separated string; '' when there is
        nothing to clean.
    """
    # word_tokenize only accepts strings, so a single missing value would
    # otherwise abort the whole Series.apply call.
    if not isinstance(text, str):
        return ''

    # Tokenize, dropping punctuation/special characters (except '!') and lowercasing
    tokens = [tok.lower() for tok in word_tokenize(text) if tok.isalnum() or tok == '!']

    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    cleaned_words = []
    for tok in tokens:
        # Filter on the surface form BEFORE lemmatizing: the default noun-mode
        # lemmatizer can turn a stop word into a non-stop-word
        # (e.g. 'was' -> 'wa'), which would then survive the filter below.
        if tok in stop_words:
            continue
        lemma = lemmatizer.lemmatize(tok)
        # Still drop tokens whose lemma is itself a stop word
        if lemma not in stop_words:
            cleaned_words.append(lemma)

    return ' '.join(cleaned_words)

# Function to classify news sentiment using TextBlob
def classify_news(news_text):
    """Label text as 'positive', 'negative' or 'neutral' from its TextBlob polarity.

    Args:
        news_text: Text to score. Non-string values (for example a float NaN
            standing in for missing content) and blank strings are treated as
            carrying no sentiment.

    Returns:
        'positive' when the polarity is > 0.5, 'negative' when it is < -0.5,
        and 'neutral' otherwise (including missing/blank input).
    """
    # Missing or blank content has nothing to score; returning early also
    # keeps non-string values away from TextBlob.
    if not isinstance(news_text, str) or not news_text.strip():
        return 'neutral'

    polarity = TextBlob(news_text).sentiment.polarity

    # Determine sentiment label based on polarity score
    if polarity > 0.5:
        return 'positive'
    if polarity < -0.5:
        return 'negative'
    return 'neutral'

# Clean every article body with preprocess_text and store it in a new column
news_df['cleaned_content'] = news_df['content'].apply(preprocess_text)

# Classify the cleaned text with TextBlob
news_df['textblob_sentiment'] = news_df['cleaned_content'].apply(classify_news)

# Keep only the 'content' and 'textblob_sentiment' columns
final_df = news_df[['content', 'textblob_sentiment']]

# Save the final DataFrame to a CSV file
final_df.to_csv('india_textblob.csv', index=False)  # Change the path to write somewhere else

# Display the first rows of the final DataFrame
print(final_df.head())


                                             content textblob_sentiment
0  Wednesday, March 6, 2024Favorite Today, Touris...            neutral
1  Thiruvananthapuram: In an effort to firmly sec...            neutral
2  ENGLISH MALAYALAM NEWSPAPER E-Paper More+ 900 ...            neutral
3  For nature lovers, there could be nothing more...            neutral
4  Thursday, January 11, 2024Favorite Kerala gear...            neutral


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [5]:
import pandas as pd
import nltk
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download the NLTK data for the tokenizer, stop-word list and WordNet
# lemmatizer used by preprocess_text
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Read at most the first 100,000 rows of the news CSV
news_df = pd.read_csv('news_data1.csv', nrows=100000)  # Change the path if your CSV lives elsewhere

# Preprocessing function
def preprocess_text(text):
    """Normalize raw article text into a lowercase, lemmatized, stop-word-free string.

    Steps: tokenize, keep only alphanumeric tokens and '!', lowercase them,
    drop English stop words, lemmatize what is left, and join with spaces.

    Args:
        text: Raw text. Anything that is not a string (for example the float
            NaN pandas produces for an empty CSV cell) is treated as empty.

    Returns:
        The cleaned text as one space-separated string; '' when there is
        nothing to clean.
    """
    # word_tokenize only accepts strings, so a single missing value would
    # otherwise abort the whole Series.apply call.
    if not isinstance(text, str):
        return ''

    # Tokenize, dropping punctuation/special characters (except '!') and lowercasing
    tokens = [tok.lower() for tok in word_tokenize(text) if tok.isalnum() or tok == '!']

    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    cleaned_words = []
    for tok in tokens:
        # Filter on the surface form BEFORE lemmatizing: the default noun-mode
        # lemmatizer can turn a stop word into a non-stop-word
        # (e.g. 'was' -> 'wa'), which would then survive the filter below.
        if tok in stop_words:
            continue
        lemma = lemmatizer.lemmatize(tok)
        # Still drop tokens whose lemma is itself a stop word
        if lemma not in stop_words:
            cleaned_words.append(lemma)

    return ' '.join(cleaned_words)

# Clean every article body with preprocess_text and store it in a new column
news_df['cleaned_content'] = news_df['content'].apply(preprocess_text)

# Load the BERTweet sentiment checkpoint and its tokenizer by name; both are
# read as globals by classify_news_batch
model_name = "finiteautomata/bertweet-base-sentiment-analysis"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Function to classify news sentiment using BERTweet model
def classify_news_batch(news_text_list):
    """Classify a batch of texts with the module-level BERTweet model.

    Args:
        news_text_list: Sequence of texts to classify in one forward pass.

    Returns:
        A list of 'negative' / 'neutral' / 'positive' labels, one per input
        text and in the same order. An empty input yields an empty list.
    """
    # Nothing to classify: skip the tokenizer and the model entirely.
    if len(news_text_list) == 0:
        return []

    # Tokenize the whole batch, padding to the longest item and truncating long ones.
    # NOTE(review): max_length=512 is kept from the original code -- confirm it
    # is within the sequence length this checkpoint supports.
    inputs = tokenizer(news_text_list, return_tensors="pt", padding=True, truncation=True, max_length=512)

    # Inference only, so gradient tracking is switched off
    with torch.no_grad():
        logits = model(**inputs).logits
        predicted_classes = torch.argmax(logits, dim=1).tolist()

    # Class index -> label: 0 is negative, 1 is neutral, anything else is positive
    index_to_label = {0: 'negative', 1: 'neutral'}
    return [index_to_label.get(class_index, 'positive') for class_index in predicted_classes]

# Classify the cleaned articles batch_size rows at a time and flatten the
# per-batch label lists into one list aligned with the DataFrame rows
batch_size = 32
sentiment_labels = [
    label
    for batch_start in range(0, len(news_df), batch_size)
    for label in classify_news_batch(
        news_df['cleaned_content'].iloc[batch_start:batch_start + batch_size].tolist()
    )
]

# Attach the labels as a new column
news_df['bertweet_sentiment'] = sentiment_labels

# Narrow the frame to the article text and its label
news_df = news_df[['content', 'bertweet_sentiment']]

# Write the result to a CSV file
news_df.to_csv('india_bertweet.csv', index=False)

# Show the first rows of the labelled data
print(news_df.head())


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/338 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/843k [00:00<?, ?B/s]

bpe.codes:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/22.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/167 [00:00<?, ?B/s]

emoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3 install emoji==0.6.0


config.json:   0%|          | 0.00/949 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/540M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


                                             content bertweet_sentiment
0  Wednesday, March 6, 2024Favorite Today, Touris...            neutral
1  Thiruvananthapuram: In an effort to firmly sec...           positive
2  ENGLISH MALAYALAM NEWSPAPER E-Paper More+ 900 ...           positive
3  For nature lovers, there could be nothing more...           positive
4  Thursday, January 11, 2024Favorite Kerala gear...            neutral


In [6]:
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import wordnet
from nltk.corpus import sentiwordnet as swn

# Download the NLTK resources this cell uses: tokenizer data, stop-word list,
# WordNet and SentiWordNet
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('sentiwordnet')

# Read at most the first 100,000 rows of the news CSV
news_df = pd.read_csv('news_data1.csv', nrows=100000)  # Change the path if your CSV lives elsewhere

# Preprocessing function
def preprocess_text(text):
    """Tokenize text, drop English stop words and lemmatize the rest as verbs.

    Unlike a full normalization pass, this keeps the original casing and any
    punctuation tokens; only stop words (matched case-insensitively) are removed.

    Args:
        text: Raw text. Anything that is not a string (for example the float
            NaN pandas produces for an empty CSV cell) is treated as empty.

    Returns:
        The remaining lemmatized tokens joined by single spaces; '' when
        there is nothing to process.
    """
    # word_tokenize only accepts strings, so a single missing value would
    # otherwise abort the whole Series.apply call.
    if not isinstance(text, str):
        return ''

    # Tokenization
    tokens = word_tokenize(text)

    # Stop words are compared in lowercase so capitalized ones are removed too
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    # Lemmatize every surviving token with the verb part-of-speech
    return ' '.join(
        lemmatizer.lemmatize(word, pos='v')
        for word in tokens
        if word.lower() not in stop_words
    )

# Clean every article body with preprocess_text and store it in a new column
news_df['cleaned_content'] = news_df['content'].apply(preprocess_text)

# Fit a TF-IDF vectorizer on the cleaned text and build the document-term matrix.
# NOTE(review): `tfidf_matrix` is not referenced by the scoring code later in
# this cell -- confirm whether this step is still needed.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(news_df['cleaned_content'])

# Function to calculate polarity of sentiment words using WordNet
def calculate_polarity(word):
    """Return the SentiWordNet polarity of a word's first WordNet synset.

    The polarity is the synset's positive score minus its negative score.
    Words with no WordNet synset score 0.
    """
    candidates = wordnet.synsets(word)
    if candidates:
        # Only the first listed sense is consulted
        first_sense = swn.senti_synset(candidates[0].name())
        return first_sense.pos_score() - first_sense.neg_score()
    return 0

# Function to calculate total sentiment score of news articles
def calculate_sentiment_score(text):
    """Sum the calculate_polarity value of every token in the text."""
    running_total = 0
    for token in word_tokenize(text):
        running_total += calculate_polarity(token)
    return running_total

# Store the summed token polarity of each cleaned article in a 'sentiment_score' column
news_df['sentiment_score'] = news_df['cleaned_content'].apply(calculate_sentiment_score)

# Classify news articles into positive, negative, and neutral based on sentiment score
def classify_sentiment(score):
    """Map a numeric score to a label by its sign.

    Returns 'positive' for scores above zero, 'negative' for scores below
    zero, and 'neutral' for everything else.
    """
    if score > 0:
        return 'positive'
    return 'negative' if score < 0 else 'neutral'

# Turn each numeric score into a 'positive' / 'negative' / 'neutral' label
news_df['sentiment'] = news_df['sentiment_score'].apply(classify_sentiment)

# Rebind news_df to just the article text and its label (other columns are dropped)
news_df = news_df[['content', 'sentiment']]

# Write the two remaining columns to a CSV file
news_df.to_csv('india_sentiword.csv', index=False)

# Display the first rows of the DataFrame with sentiment analysis results
print(news_df.head())


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package sentiwordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/sentiwordnet.zip.


                                             content sentiment
0  Wednesday, March 6, 2024Favorite Today, Touris...  positive
1  Thiruvananthapuram: In an effort to firmly sec...  positive
2  ENGLISH MALAYALAM NEWSPAPER E-Paper More+ 900 ...  positive
3  For nature lovers, there could be nothing more...  positive
4  Thursday, January 11, 2024Favorite Kerala gear...  positive
