In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

# Task 1

#### Creating a simple content classifier using “Top 15 Anime And K-Drama Like True Beauty Webtoon” as inspiration.

Downloaded a "Webtoon Dataset.csv" from Kaggle for this project.

In [2]:
df = pd.read_csv('Webtoon Dataset.csv')
df.head()

Unnamed: 0,id,Name,Writer,Likes,Genre,Rating,Subscribers,Summary,Update,Reading Link
0,0,Let's Play,Leeanne M. Krecic (Mongie),30.6M,Romance,9.62,4.2M,"She's young, single and about to achieve her d...",UP EVERY TUESDAY,https://www.webtoons.com/en/romance/letsplay/l...
1,1,True Beauty,Yaongyi,39.9M,Romance,9.6,6.4M,"After binge-watching beauty videos online, a s...",UP EVERY WEDNESDAY,https://www.webtoons.com/en/romance/truebeauty...
2,2,Midnight Poppy Land,Lilydusk,10.4M,Romance,9.81,2.1M,After making a grisly discovery in the country...,UP EVERY SATURDAY,https://www.webtoons.com/en/romance/midnight-p...
3,3,Age Matters,Enjelicious,25.9M,Romance,9.79,3.5M,She's a hopeless romantic who's turning 30's ...,UP EVERY WEDNESDAY,https://www.webtoons.com/en/romance/age-matter...
4,4,Unholy Blood,Lina Im / Jeonghyeon Kim,9.9M,Supernatural,9.85,1.5M,When vampires destroy her chance to have the n...,UP EVERY THURSDAY,https://www.webtoons.com/en/supernatural/unhol...


In [3]:
df = df[['Summary', 'Genre']]

In [4]:
df.dropna(inplace=True)

In [5]:
df.head()

Unnamed: 0,Summary,Genre
0,"She's young, single and about to achieve her d...",Romance
1,"After binge-watching beauty videos online, a s...",Romance
2,After making a grisly discovery in the country...,Romance
3,She's a hopeless romantic who's turning 30's ...,Romance
4,When vampires destroy her chance to have the n...,Supernatural


In [6]:
df.tail()

Unnamed: 0,Summary,Genre
564,"Life's funny. One minute you're jobless, deep ...",Supernatural
565,"""Adamsville"" is an all ages series about two m...",Mystery
566,Get your WEBTOON news here!,Informative
567,A series of tips for staying safe during the C...,Informative
568,The mysterious team that brings Webtoon to you...,Informative


In [7]:
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
X = tfidf.fit_transform(df['Summary'])
y = df['Genre']

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

In [10]:
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.24561403508771928
Classification Report:
                precision    recall  f1-score   support

       Action       1.00      0.25      0.40         4
       Comedy       0.00      0.00      0.00        12
        Drama       0.00      0.00      0.00        16
      Fantasy       0.21      0.88      0.34        17
 Heartwarming       0.00      0.00      0.00         1
       Horror       0.00      0.00      0.00         6
  Informative       0.00      0.00      0.00         2
      Mystery       0.00      0.00      0.00         4
      Romance       0.30      0.63      0.41        19
       Sci-fi       0.00      0.00      0.00         4
Slice of life       0.00      0.00      0.00         6
       Sports       0.00      0.00      0.00         2
    Superhero       0.00      0.00      0.00         3
 Supernatural       0.00      0.00      0.00         9
     Thriller       0.00      0.00      0.00         9

     accuracy                           0.25       114
    macro

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


#### The accuracy struggles stem from the dataset's complexity and imbalance. Genres like "Fantasy" and "Romance" have ample data, while others like "Heartwarming" or "Informative" lack sufficient samples for the model to learn effectively. This disparity hampers the classifier's ability to predict accurately across all categories.

#### Additionally, text classification inherently involves nuances; word choice and context vary widely, making it a challenging task, especially with limited data. Using more balanced and extensive datasets, or perhaps more advanced techniques like deep learning, could improve results.

#### Essentially, the model needs more and better data to truly understand and differentiate between all those genres.

In [11]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(max_iter=1000, class_weight='balanced')
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.2807017543859649
Classification Report:
                precision    recall  f1-score   support

       Action       0.18      0.50      0.27         4
       Comedy       0.11      0.08      0.10        12
        Drama       0.31      0.31      0.31        16
      Fantasy       0.36      0.53      0.43        17
 Heartwarming       0.00      0.00      0.00         1
       Horror       0.33      0.33      0.33         6
  Informative       0.00      0.00      0.00         2
      Mystery       0.00      0.00      0.00         4
      Romance       0.42      0.42      0.42        19
       Sci-fi       0.67      0.50      0.57         4
Slice of life       0.00      0.00      0.00         6
       Sports       0.00      0.00      0.00         2
    Superhero       1.00      0.67      0.80         3
 Supernatural       0.00      0.00      0.00         9
     Thriller       0.20      0.11      0.14         9

     accuracy                           0.28       114
    macro 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# Task 2

#### Performing a basic sentiment analysis on user comments for “The Difference Between Manga And Manhwa”.

In [12]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from textblob import TextBlob

In [13]:
url = 'https://animemangatoon.com/difference-between-manga-and-manhwa-webtoon/'

In [14]:
response = requests.get(url)

In [15]:
soup = BeautifulSoup(response.content, 'html.parser')

In [16]:
comments = soup.select('div#div-comment-3.comment-body')

In [17]:
comment_texts = [comment.get_text(strip=True) for comment in comments]

In [18]:
comm = pd.DataFrame(comment_texts, columns=['comment'])

### There is only one comment on the web page.

In [19]:
print(comm.head())

                                             comment
0  Anvitasays:May 22, 2024 at 18:54One of the bes...


In [20]:
def get_sentiment(text):
    return TextBlob(text).sentiment.polarity

comm['sentiment'] = comm['comment'].apply(get_sentiment)

def categorize_sentiment(polarity):
    if polarity > 0:
        return 'Positive'
    elif polarity < 0:
        return 'Negative'
    else:
        return 'Neutral'

comm['sentiment_category'] = comm['sentiment'].apply(categorize_sentiment)

summary = comm['sentiment_category'].value_counts(normalize=True) * 100
print(summary)

sentiment_category
Positive    100.0
Name: proportion, dtype: float64


# Task 3

#### Building a basic chatbot using “Castle Swimmer Chapter 83-89: Unveiling New Prophecy”.

In [21]:
import requests
from bs4 import BeautifulSoup

url = 'https://animemangatoon.com/castle-swimmer-unveiling-new-prophecy/'

response = requests.get(url)

soup = BeautifulSoup(response.content, 'html.parser')

article_content = soup.find('div', class_='content-inner').get_text(strip=True)

full_article = article_content

In [22]:
import nltk

nltk.download('punkt')
print(nltk.data.path)


['C:\\Users\\hp/nltk_data', 'e:\\PROJECTS\\animemangatoon\\Anime-Webtoon-Analysis-Chatbot\\myenv\\nltk_data', 'e:\\PROJECTS\\animemangatoon\\Anime-Webtoon-Analysis-Chatbot\\myenv\\share\\nltk_data', 'e:\\PROJECTS\\animemangatoon\\Anime-Webtoon-Analysis-Chatbot\\myenv\\lib\\nltk_data', 'C:\\Users\\hp\\AppData\\Roaming\\nltk_data', 'C:\\nltk_data', 'D:\\nltk_data', 'E:\\nltk_data']


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [23]:
import re

responses = {
    "about": "Castle Swimmer is a fantasy webtoon about two sea creatures, Kappa and Siren, who are bound by a prophecy.",
    "main characters": "The main characters are Kappa, the Beacon, and Siren, the prince of sharks.",
    "kappa": "Kappa is one of the main characters, known as the Beacon.",
    "siren": "Siren is the prince of sharks and one of the main characters.",
    "prophecy": "The prophecy foretells that Kappa, the Beacon, will lead his people to a prosperous future, while Siren is destined to kill Kappa to ensure his own people's survival.",
    "season 2": "Season 2 delves deeper into the mystery and intrigue, with Siren's curse being a central theme.",
    "chapter 83": "Chapter 83 begins with a shocking revelation about Siren's curse and his role as the savior turning into a fatal trap.",
    "chapter 84": "Chapter 84 reveals that a living mini-god must sustain a curse, adding a new layer to the story.",
    "curse": "The curse is a significant theme, particularly for Siren as it affects his destiny and interactions with Kappa.",
}

def chatbot_response(user_input):
    user_input = user_input.lower()
    
    for keyword in responses:
        if re.search(r'\b' + re.escape(keyword) + r'\b', user_input):
            return responses[keyword]
    
    return "I'm not sure how to answer that. Can you ask something else?"



In [24]:
print(chatbot_response("What is Castle Swimmer about?"))

Castle Swimmer is a fantasy webtoon about two sea creatures, Kappa and Siren, who are bound by a prophecy.


In [25]:
print(chatbot_response("Who are the main characters?"))

The main characters are Kappa, the Beacon, and Siren, the prince of sharks.


In [26]:
print(chatbot_response("Tell me about Chapter 84."))

Castle Swimmer is a fantasy webtoon about two sea creatures, Kappa and Siren, who are bound by a prophecy.


In [27]:
print(chatbot_response("What is the prophecy?"))

The prophecy foretells that Kappa, the Beacon, will lead his people to a prosperous future, while Siren is destined to kill Kappa to ensure his own people's survival.


In [28]:
print(chatbot_response("Who is Siren?"))

Siren is the prince of sharks and one of the main characters.
