In [1]:
import pandas as pd
import numpy as np
import re
import os
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE

In [2]:
# Upper bound on rows collected per decision bucket. The value is large enough
# that it acts as "no cap" unless it is lowered.
MAX_PER_CLASS = 1_200_000_000_000


def _decision_for_status(status):
    """Map an integer status code to its decision bucket.

    0-2 -> 'accepted', 3-4 -> 'pending', 5-7 -> 'blocked'; anything else
    (negative or above 7) -> 'unknown'.
    """
    if status in (0, 1, 2):
        return 'accepted'
    if status in (3, 4):
        return 'pending'
    if status in (5, 6, 7):
        return 'blocked'
    return 'unknown'


def process_csv_file(path, counts):
    """Read one CSV and flatten it into (Text, Label, Status, Decision) records.

    Only rows with ``user_primary_language == 'en'`` whose text contains no
    ``{{...}}`` sequence are kept. Missing values are replaced by 0.

    * A row whose label columns are all 0 yields a single
      ``'Nothing Wrong'`` / ``'accepted'`` record.
    * Otherwise each non-zero label column yields one record whose decision is
      derived from the integer status (see ``_decision_for_status``).

    A record is only emitted while its decision bucket is below
    ``MAX_PER_CLASS``; iteration stops once 'accepted', 'pending' and
    'blocked' are all full.

    Parameters
    ----------
    path : str or path-like
        CSV file to read. It must contain the ``text``,
        ``user_primary_language`` and label columns selected below.
    counts : dict
        Running per-decision counters, updated in place. Missing keys are
        treated as 0, so an 'unknown' key is created on demand.

    Returns
    -------
    tuple[list[dict], dict]
        The emitted records and the same (mutated) ``counts`` mapping.
    """
    df = pd.read_csv(path)
    df = df[df['user_primary_language'] == 'en']
    # Drop rows whose text contains a {{...}} sequence.
    df = df[~df['text'].astype(str).str.contains(r'\{\{.*?\}\}', regex=True)]

    x = df[[
        "text", "user_primary_language", "Bullying", "Fighting", "Sexting", "Vulgar", "Drugs",
        "InGame", "Alarm", "Fraud", "Racist", "Religion", "Junk", "Website", "Grooming",
        "PublicThreats", "RealName", "ExtremistRecruitment", "Subversive", "Sentiment", "Politics"
    ]].fillna(0)

    # Index.difference returns the names sorted, so labels are visited in
    # alphabetical order for every row.
    label_cols = x.columns.difference(['text', 'user_primary_language'])
    capped = ('accepted', 'pending', 'blocked')

    rows = []
    # Plain tuples avoid building a Series per row (as iterrows does).
    status_rows = x[label_cols].itertuples(index=False, name=None)
    for text, statuses in zip(x['text'], status_rows):
        if all(counts.get(d, 0) >= MAX_PER_CLASS for d in capped):
            break

        if all(value == 0 for value in statuses):
            if counts.get('accepted', 0) < MAX_PER_CLASS:
                counts['accepted'] = counts.get('accepted', 0) + 1
                rows.append({
                    'Text': text,
                    'Label': 'Nothing Wrong',
                    'Status': 0,
                    'Decision': 'accepted'
                })
            continue

        for label, value in zip(label_cols, statuses):
            status = int(value)
            if status == 0:
                continue

            decision = _decision_for_status(status)
            if counts.get(decision, 0) >= MAX_PER_CLASS:
                continue

            # .get() keeps an out-of-range status ('unknown') from raising
            # KeyError when the caller only seeded the three regular buckets.
            counts[decision] = counts.get(decision, 0) + 1
            rows.append({
                'Text': text,
                'Label': label,
                'Status': status,
                'Decision': decision
            })
    return rows, counts

In [3]:
def generate_summary_report(df):
    """Print a summary of decisions and labels for a flattened dataset.

    The report has three sections: overall counts per decision
    (accepted / pending / blocked), the number of 'Nothing Wrong' rows, and a
    per-label breakdown of decisions for every other label.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'Label' and 'Decision' columns. An empty frame (including
        one with no columns at all) produces an all-zero report instead of
        raising ``KeyError``.

    Returns
    -------
    None
        The report is written to stdout only.
    """
    summary_data = []
    has_rows = not df.empty

    # Overall decision summary
    decision_counts = df['Decision'].value_counts().to_dict() if has_rows else {}
    summary_data.append(['Summary'])
    for decision in ('accepted', 'pending', 'blocked'):
        summary_data.append([f'Overall {decision}', decision_counts.get(decision, 0)])
    summary_data.append([])

    # "Nothing Wrong"
    nothing_wrong_count = int((df['Label'] == 'Nothing Wrong').sum()) if has_rows else 0
    summary_data.append(['Nothing Wrong', nothing_wrong_count])
    summary_data.append([])

    # Per-label summary
    summary_data.append(['Per-label Summary', 'Accepted', 'Pending', 'Blocked'])
    flagged = df[df['Label'] != 'Nothing Wrong'] if has_rows else df
    if not flagged.empty:
        # Rows = labels, columns = decisions that actually occur; a decision
        # that never occurs has no column, hence the .get(..., 0) below.
        label_stats = flagged.groupby(['Label', 'Decision']).size().unstack(fill_value=0)
        for label in label_stats.index:
            per_decision = label_stats.loc[label]
            summary_data.append([
                label,
                int(per_decision.get('accepted', 0)),
                int(per_decision.get('pending', 0)),
                int(per_decision.get('blocked', 0)),
            ])

    # To persist the report:
    # pd.DataFrame(summary_data).to_csv("summary_report_csv.csv", index=False, header=False, encoding="utf-8-sig")

    # Print
    print("\n📊 Summary Report:\n")
    for row in summary_data:
        print(', '.join(str(cell) for cell in row if cell != ''))


In [None]:
def file_clean_summary_for_folder(folder_path, output_path="final1_csv.csv"):
    """Process every ``.csv`` file in a folder, save the combined rows and print a summary.

    Each file is passed to ``process_csv_file`` with a shared ``counts``
    mapping so the per-decision caps (``MAX_PER_CLASS``) apply across the
    whole folder. Processing stops early once all three buckets are full.

    Parameters
    ----------
    folder_path : str or path-like
        Directory to scan (non-recursive) for files ending in ``.csv``.
    output_path : str or path-like, optional
        Destination of the combined CSV. Defaults to ``"final1_csv.csv"``.

    Returns
    -------
    None
        Results are written to ``output_path`` and reported on stdout.
    """
    combined_rows = []
    counts = {'accepted': 0, 'pending': 0, 'blocked': 0}

    # os.listdir returns entries in arbitrary order; sorting makes the set of
    # rows that survive the caps / early stop reproducible between runs.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".csv"):
            continue
        file_path = os.path.join(folder_path, filename)
        print(f"🔄 Processing: {filename}")
        rows, counts = process_csv_file(file_path, counts)
        combined_rows.extend(rows)

        if all(counts[d] >= MAX_PER_CLASS for d in ['accepted', 'pending', 'blocked']):
            print("✅ All classes reached their limits. Stopping early.")
            break

    # Explicit columns keep the CSV header present even when nothing was collected.
    final_output = pd.DataFrame(combined_rows, columns=['Text', 'Label', 'Status', 'Decision'])
    final_output.to_csv(output_path, index=False, encoding="utf-8-sig")
    print(f"✅ Saved {output_path} with class caps applied.")

    if final_output.empty:
        # Nothing to summarise (no CSV files, or every row was filtered out).
        print("⚠️ No rows were collected; skipping summary report.")
        return
    generate_summary_report(final_output)

# Run the pipeline on the CSV export folder. NOTE(review): this is a hard-coded
# absolute Windows path, so it must be adjusted on any other machine.
file_clean_summary_for_folder(r"U:\N\new_csv")

In [1]:
import pandas as pd

# Reload the combined dataset from disk and print it as a sanity check.
res = pd.read_csv('final1_csv.csv')
print(res)

# Count the physical lines in the same file (the header line is included).
# `res` is deliberately rebound from the DataFrame to this integer count.
with open('final1_csv.csv', 'r', encoding='utf-8') as f:
    res = 0
    for _ in f:
        res += 1
print(res)


                                                     Text          Label  \
0            oop nws nws, may I ask for future preface of           Junk   
1             yeah i had to use my own like what the heck         Vulgar   
2                                              on you too  Nothing Wrong   
3                                            First letter         Vulgar   
4                                         THEY are stinky       Bullying   
...                                                   ...            ...   
3005110                                           trumpet       Politics   
3005111                                 ur rose look slay       Fighting   
3005112  NO OFFNESE BUT HE IS SO ANNOYING SOMETIMES HAHAH       Bullying   
3005113                                           Trumpet       Politics   
3005114                      we got richie rich over here       RealName   

         Status  Decision  
0             2  accepted  
1             5   blocked  
2  

In [5]:
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.preprocessing import LabelEncoder
# from imblearn.over_sampling import SMOTE
# import pandas as pd


# # Load final_output (skip if it's already loaded)
# final_output = pd.read_csv("final_csv0.csv")


# # Filter only the rows with meaningful labels (skip 'Nothing Wrong' if not useful)
# df_balanced_input = final_output[final_output['Label'] != 'Nothing Wrong'].copy()


# # Step 1: Vectorize the text
# vectorizer = TfidfVectorizer(max_features=5000)
# X = vectorizer.fit_transform(df_balanced_input['Text'])


# # Step 2: Encode the Decision labels
# le = LabelEncoder()
# y = le.fit_transform(df_balanced_input['Decision'])  # e.g., accepted -> 0, blocked -> 1, etc.


# # Step 3: Apply SMOTE
# smote = SMOTE(random_state=42)
# X_resampled, y_resampled = smote.fit_resample(X, y)


# # Step 4: Map back to original labels
# balanced_df = pd.DataFrame(X_resampled.toarray(), columns=vectorizer.get_feature_names_out())
# balanced_df['Text'] = vectorizer.inverse_transform(X_resampled)
# balanced_df['Text'] = [' '.join(text) for text in balanced_df['Text']]
# balanced_df['Decision'] = le.inverse_transform(y_resampled)


# # Optional: Merge original labels back (one way is to randomly assign an original label from the same Decision class)
# # For this, we'll collect every original label observed for each Decision class
# label_map = df_balanced_input.groupby('Decision')['Label'].apply(list).to_dict()


# # Randomly assign original labels from each class
# import random
# balanced_df['Label'] = balanced_df['Decision'].apply(lambda d: random.choice(label_map[d]))
# balanced_df['Status'] = balanced_df['Decision'].map({
#     'accepted': 1,
#     'pending': 4,
#     'blocked': 6
# })  # or keep it as placeholder


# # Save to new CSV
# balanced_df.to_csv("balanced_decision_csv0.csv", index=False, encoding="utf-8-sig")


# # Optional: Show class distribution
# print("\n📊 Balanced Class Distribution:")
# print(balanced_df['Decision'].value_counts())


In [6]:
# import pandas as pd
# import random
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.preprocessing import LabelEncoder
# from imblearn.over_sampling import SMOTE

# # Load and filter
# final_output = pd.read_csv("final_csv0.csv")
# df_balanced_input = final_output[final_output['Label'] != 'Nothing Wrong'].copy()

# # Vectorize
# vectorizer = TfidfVectorizer(max_features=5000)
# X = vectorizer.fit_transform(df_balanced_input['Text'])

# # Encode labels
# le = LabelEncoder()
# y = le.fit_transform(df_balanced_input['Decision'])

# # SMOTE
# smote = SMOTE(random_state=42)
# X_resampled, y_resampled = smote.fit_resample(X, y)

# # Map y back to text decisions
# y_labels = le.inverse_transform(y_resampled)

# # Group original rows (Text, Label, Status) per Decision
# decision_to_rows = df_balanced_input.groupby('Decision')[['Text', 'Label', 'Status']].apply(lambda g: g.to_dict('records')).to_dict()

# # Generate final balanced rows
# balanced_rows = []
# for decision in y_labels:
#     row = random.choice(decision_to_rows[decision])  # sample actual row
#     balanced_rows.append({
#         'Text': row['Text'],
#         'Label': row['Label'],
#         'Status': row['Status'],
#         'Decision': decision
#     })

# # Save final readable CSV
# balanced_final_df = pd.DataFrame(balanced_rows)
# balanced_final_df.to_csv("balanced_final_csv0.csv", index=False, encoding="utf-8-sig")

# print("✅ Saved as balanced_final_csv0.csv")


In [7]:
# import pandas as pd

# # Load the balanced data if not already in memory
# # balanced_final_df = pd.read_csv("balanced_final_csv0.csv")  # Uncomment if needed

# summary_data = []

# # 📌 Overall decision summary
# decision_counts = balanced_final_df['Decision'].value_counts().to_dict()
# summary_data.append(['Summary'])
# summary_data.append(['Overall accepted', decision_counts.get('accepted', 0)])
# summary_data.append(['Overall pending', decision_counts.get('pending', 0)])
# summary_data.append(['Overall blocked', decision_counts.get('blocked', 0)])
# summary_data.append([])  # Blank row

# # 📌 Count of "Nothing Wrong" (just for completeness; should be 0)
# nothing_wrong_count = balanced_final_df[balanced_final_df['Label'] == 'Nothing Wrong'].shape[0]
# summary_data.append(['Nothing Wrong', nothing_wrong_count])
# summary_data.append([])

# # 📌 Per-label summary
# summary_data.append(['Per-label Summary', 'Accepted', 'Pending', 'Blocked'])

# # Group and summarize
# label_stats = (
#     balanced_final_df[balanced_final_df['Label'] != 'Nothing Wrong']
#     .groupby(['Label', 'Decision'])
#     .size()
#     .unstack(fill_value=0)
# )

# for label in label_stats.index:
#     accepted = label_stats.loc[label].get('accepted', 0)
#     pending = label_stats.loc[label].get('pending', 0)
#     blocked = label_stats.loc[label].get('blocked', 0)
#     summary_data.append([label, accepted, pending, blocked])

# # Save summary to CSV
# summary_df = pd.DataFrame(summary_data)
# summary_df.to_csv("balanced_summary_report_csv0.csv", index=False, header=False, encoding="utf-8-sig")

# # Print in console
# print("\n📊 Balanced Summary Report:\n")
# for row in summary_data:
#     print(', '.join(str(cell) for cell in row if cell != ''))
