In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# 📚 Import libraries
import pandas as pd

# 📁 Load the training data
train_df = pd.read_csv("train.csv")
print("Training Data Loaded ✅")
print("Shape of train data:", train_df.shape)
print(train_df.head())

# 📁 Load the test data
test_df = pd.read_csv("test.csv")
print("\nTest Data Loaded ✅")
print("Shape of test data:", test_df.shape)
print(test_df.head())

# 📁 Load the test labels (true labels for test set, available after competition ended)
test_labels = pd.read_csv("test_labels.csv")
print("\nTest Labels Loaded ✅")
print("Shape of test labels:", test_labels.shape)
print(test_labels.head())

# 📁 Load sample submission (used during competition for submission format)
sample_submission = pd.read_csv("sample_submission.csv")
print("\nSample Submission Loaded ✅")
print("Shape of sample submission:", sample_submission.shape)
print(sample_submission.head())


Training Data Loaded ✅
Shape of train data: (159571, 8)
                 id                                       comment_text  toxic  \
0  0000997932d777bf  Explanation\nWhy the edits made under my usern...      0   
1  000103f0d9cfb60f  D'aww! He matches this background colour I'm s...      0   
2  000113f07ec002fd  Hey man, I'm really not trying to edit war. It...      0   
3  0001b41b1c6bb37e  "\nMore\nI can't make any real suggestions on ...      0   
4  0001d958c54c6e35  You, sir, are my hero. Any chance you remember...      0   

   severe_toxic  obscene  threat  insult  identity_hate  
0             0        0       0       0              0  
1             0        0       0       0              0  
2             0        0       0       0              0  
3             0        0       0       0              0  
4             0        0       0       0              0  

Test Data Loaded ✅
Shape of test data: (153164, 2)
                 id                                      

# Cleaning the text

In [3]:
import re
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from tqdm import tqdm  # for progress bar

# Download stopwords once
nltk.download('stopwords')

# 📁 Load the dataset (make sure it's already loaded as train_df)
train_df = pd.read_csv('train.csv')

# 🧠 Initialize preprocessing tools
ps = PorterStemmer()
all_stopwords = stopwords.words('english')
if 'not' in all_stopwords:
    all_stopwords.remove('not')  # Keep "not" for sentiment context

# 🧹 Clean the comments
corpus = []
for comment in tqdm(train_df['comment_text'].fillna("")[:10000]):  # Limit for quick testing
    review = re.sub('[^a-zA-Z]', ' ', comment)  # Remove non-alphabetic characters
    review = review.lower().split()
    review = [ps.stem(word) for word in review if word not in all_stopwords]
    corpus.append(' '.join(review))

# 🖨️ Show a few cleaned comments
print("\nSample Cleaned Comments:")
for i in range(5):
    print(f"{i+1}: {corpus[i]}")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
100%|██████████| 10000/10000 [00:09<00:00, 1076.80it/s]


Sample Cleaned Comments:
1: explan edit made usernam hardcor metallica fan revert vandal closur ga vote new york doll fac pleas remov templat talk page sinc retir
2: aww match background colour seemingli stuck thank talk januari utc
3: hey man realli not tri edit war guy constantli remov relev inform talk edit instead talk page seem care format actual info
4: make real suggest improv wonder section statist later subsect type accid think refer may need tidi exact format ie date format etc later one els first prefer format style refer want pleas let know appear backlog articl review guess may delay review turn list relev form eg wikipedia good articl nomin transport
5: sir hero chanc rememb page





In [4]:
import re
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Download stopwords if not already
nltk.download('stopwords')

# Load your data (if not already)
train_df = pd.read_csv("train.csv")  # Or use your variable if already loaded

# Define the cleaning function
def clean_text(text):
    text = re.sub('[^a-zA-Z]', ' ', str(text))  # Clean non-alphabetic characters
    text = text.lower()
    words = text.split()

    ps = PorterStemmer()
    stop_words = set(stopwords.words('english'))
    stop_words.discard('not')  # Retain "not" for sentiment

    # Handle potential recursion error in PorterStemmer
    cleaned_words = []
    for word in words:
        if word not in stop_words:
            try:
                cleaned_words.append(ps.stem(word))
            except RecursionError:
                cleaned_words.append(word)  # Keep the original word if stemming fails

    return ' '.join(cleaned_words)

# Apply the function to the comment_text column
train_df['cleaned_text'] = train_df['comment_text'].apply(clean_text)

# View cleaned examples
print(train_df[['comment_text', 'cleaned_text']].head())

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


                                        comment_text  \
0  Explanation\nWhy the edits made under my usern...   
1  D'aww! He matches this background colour I'm s...   
2  Hey man, I'm really not trying to edit war. It...   
3  "\nMore\nI can't make any real suggestions on ...   
4  You, sir, are my hero. Any chance you remember...   

                                        cleaned_text  
0  explan edit made usernam hardcor metallica fan...  
1  aww match background colour seemingli stuck th...  
2  hey man realli not tri edit war guy constantli...  
3  make real suggest improv wonder section statis...  
4                         sir hero chanc rememb page  


In [5]:
print(corpus)

['explan edit made usernam hardcor metallica fan revert vandal closur ga vote new york doll fac pleas remov templat talk page sinc retir', 'aww match background colour seemingli stuck thank talk januari utc', 'hey man realli not tri edit war guy constantli remov relev inform talk edit instead talk page seem care format actual info', 'make real suggest improv wonder section statist later subsect type accid think refer may need tidi exact format ie date format etc later one els first prefer format style refer want pleas let know appear backlog articl review guess may delay review turn list relev form eg wikipedia good articl nomin transport', 'sir hero chanc rememb page', 'congratul well use tool well talk', 'cocksuck piss around work', 'vandal matt shirvington articl revert pleas ban', 'sorri word nonsens offens anyway not intend write anyth articl wow would jump vandal mere request encycloped one use school refer select breed page almost stub point anim breed short messi articl give in

# Creating the bag of words model

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Apply TF-IDF Vectorization on the cleaned text column
tfidf = TfidfVectorizer(max_features=1500, ngram_range=(1, 2))  # Use unigram and bigram
X = tfidf.fit_transform(train_df['cleaned_text']).toarray()

# Target labels (assuming 'toxic' is the target column; change if necessary)
y = train_df['toxic'].values  # or whatever the target column name is


# splitting the data

In [7]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2, random_state=42)

# train the Logistic Regression

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier

# Initialize and train the Logistic Regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)


# 3. Best Classical Model: Random Forest
# model = RandomForestClassifier(n_estimators=300, random_state=42)
# model.fit(X_train, y_train)


# predicting the test result

In [9]:
y_pred = model.predict(X_test)
print(np.concatenate((y_pred.reshape(len(y_pred),1),y_test.reshape(len(y_test),1)),1))

[[0 0]
 [0 0]
 [0 0]
 ...
 [0 0]
 [1 1]
 [0 0]]


# confusion matrix


In [10]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm=confusion_matrix(y_test,y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[28631   228]
 [ 1214  1842]]


0.9548174839417202

In [11]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.96      0.99      0.98     28859
           1       0.89      0.60      0.72      3056

    accuracy                           0.95     31915
   macro avg       0.92      0.80      0.85     31915
weighted avg       0.95      0.95      0.95     31915



In [32]:
!pip install gradio
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Assuming you have these from your training
# model = your trained LogisticRegression
# vectorizer = your fitted TfidfVectorizer

# Save both to Colab (will download to your machine later)
joblib.dump(model, 'toxic_model.joblib')
joblib.dump(tfidf, 'toxic_vectorizer.joblib') # Changed 'vectorizer' to 'tfidf'

print("Model and vectorizer saved!")

Collecting gradio
  Downloading gradio-5.29.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<25.0,>=22.0 (from gradio)
  Downloading aiofiles-24.1.0-py3-none-any.whl.metadata (10 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.10.0 (from gradio)
  Downloading gradio_client-1.10.0-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3 (from gradio)
  Downloading ruff-0.11.8-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
Collecting safehttpx<0.2.0,>=0.1.6 (from gradio)
  Downloading safehttpx-0.1.6-py3-none-any.whl.metadata (4.2 kB)
Collecting semantic-version~=2.0

In [41]:
import gradio as gr
import joblib

# Dark theme CSS with white text
custom_css = """
:root {
    --dark-bg: #1a1a1a;
    --darker-bg: #121212;
    --card-bg: #242424;
    --toxic-red: #ff4d4d;
    --safe-green: #4dff88;
    --accent-blue: #4da6ff;
}

.gradio-container {
    background: var(--dark-bg) !important;
    color: white !important;
    border-radius: 12px !important;
    padding: 20px !important;
    max-width: 800px !important;
    margin: auto !important;
    font-family: 'Segoe UI', Arial, sans-serif !important;
}

.header {
    text-align: center;
    color: white !important;
    margin-bottom: 25px !important;
}

.input-box {
    background: var(--card-bg) !important;
    color: white !important;
    border: 1px solid #444 !important;
    border-radius: 8px !important;
}

.output {
    background: var(--card-bg) !important;
    padding: 15px !important;
    border-radius: 10px !important;
    border: 1px solid #444 !important;
    color: white !important;
}

.toxic {
    color: var(--toxic-red) !important;
    font-weight: bold !important;
}

.not-toxic {
    color: var(--safe-green) !important;
    font-weight: bold !important;
}

.confidence {
    color: white !important;
    font-size: 1.1em !important;
}

.progress-bar {
    height: 10px;
    border-radius: 5px;
    margin: 12px 0;
    background-color: #333 !important;
}

button {
    background: var(--accent-blue) !important;
    color: white !important;
    border: none !important;
    border-radius: 6px !important;
    padding: 8px 16px !important;
}

button:hover {
    opacity: 0.9 !important;
}

.markdown {
    color: #e0e0e0 !important;
}

.footer {
    color: #aaa !important;
    font-size: 0.85em;
    margin-top: 25px;
    text-align: center;
}

.examples {
    border: 1px solid #444 !important;
    border-radius: 8px !important;
}
"""

# Load model and vectorizer
model = joblib.load('toxic_model.joblib')
vectorizer = joblib.load('toxic_vectorizer.joblib')

def predict_toxicity(text):
    text_clean = text.lower().strip()
    text_vectorized = vectorizer.transform([text_clean])
    proba = model.predict_proba(text_vectorized)[0]
    prediction = model.predict(text_vectorized)[0]
    toxic_prob = proba[1] if prediction else proba[0]

    confidence_bar = f"""
    <div class='progress-bar'>
        <div style='width:{toxic_prob*100}%; height:100%;
             border-radius:5px;
             background:{'var(--toxic-red)' if prediction else 'var(--safe-green)'}'></div>
    </div>
    """

    return {
        "label": "🚨 TOXIC" if prediction else "✅ SAFE",
        "confidence": f"{toxic_prob:.1%}",
        "bar": confidence_bar,
        "explanation": "This contains harmful language patterns." if prediction else "This appears to be respectful communication.",
        "class": "toxic" if prediction else "not-toxic"
    }

with gr.Blocks(css=custom_css, theme=gr.themes.Default()) as demo:
    with gr.Column():
        gr.Markdown("""
        <div class='header'>
        <h2>🚩 Toxic Comment Classifier</h2>
        <p>AI model detecting harmful language with 95% accuracy</p>
        </div>
        """)

        with gr.Column():
            input_box = gr.Textbox(
                label="Enter your comment",
                placeholder="Type something to analyze...",
                lines=3,
                elem_classes="input-box"
            )

            with gr.Row():
                submit_btn = gr.Button("Analyze", variant="primary")
                clear_btn = gr.Button("Clear", variant="secondary")

        with gr.Column():
            output_label = gr.Label(label="Analysis Result", elem_classes="output")
            gr.Markdown("Confidence Level:", elem_classes="markdown")
            confidence_bar = gr.HTML()
            confidence_score = gr.Markdown(elem_classes="confidence")
            output_explanation = gr.Markdown(elem_classes="markdown")

        gr.Examples(
            examples=[
                ["You're worthless and should disappear!"],
                ["I disagree but respect your right to an opinion"],
                ["This was really helpful, thank you!"],
                 ["You're an idiot and worthless!"],
            ],
            inputs=input_box,
            label="Try these examples:",
            # elem_classes="examples"
        )

        gr.Markdown("""
        <div class='footer'>
        Powered by Logistic Regression (95% accuracy) | Note: AI models may have false positives/negatives
        </div>
        """)

    def analyze_text(text):
        result = predict_toxicity(text)
        return [
            result["label"],
            result["bar"],
            f"🔍 Confidence: <span class='confidence'>{result['confidence']}</span>",
            f"📝 <span style='color:#e0e0e0'>{result['explanation']}</span>"
        ]

    submit_btn.click(
        fn=analyze_text,
        inputs=input_box,
        outputs=[output_label, confidence_bar, confidence_score, output_explanation]
    )

    clear_btn.click(
        fn=lambda: ["", "", "", ""],
        outputs=[input_box, output_label, confidence_score, output_explanation]
    )

demo.launch(share=True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://d77a9228c49dc83763.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


