In [3]:
import pandas as pd
import re
import emoji
import contractions
import json
import nltk
from nltk.corpus import words

# Fetch the NLTK 'words' corpus and keep it as a set so membership checks
# during slang normalisation are cheap.
nltk.download('words')
standard_words = set(words.words())

# Lookup table used to replace slang tokens, read from resource/slang.json
with open('resource/slang.json', 'r', encoding='utf-8') as slang_file:
    slang_dict = json.loads(slang_file.read())

def preprocess_twitter_df(df: pd.DataFrame) -> pd.DataFrame:
    """Clean raw tweet text and derive URL / mention / hashtag features.

    The input is copied, never modified. It must contain a ``post_text``
    column; a ``label`` column is optional.

    Processing order:
      1. drop bookkeeping columns, rows with missing text, and duplicated rows
      2. strip URLs and @mentions, recording their presence in ``URLs`` and
         ``Mentions`` (0/1)
      3. pull out ``#hashtags`` and remove them from the text
      4. spell out emojis, expand contractions, lowercase, remove punctuation
      5. replace tokens found in ``slang_dict`` unless they are in
         ``standard_words``
      6. add one ``hashtag_<name>`` 0/1 column per distinct hashtag

    Returns:
        A new DataFrame with the cleaned ``post_text``, the feature columns
        above, and ``label`` (when present) as the last column.
    """
    url_re = re.compile(r'http\S+|www\S+')
    mention_re = re.compile(r'@\w+')
    hashtag_re = re.compile(r'#\w+')

    df = df.copy()

    # Drop unnecessary columns (only the ones that are actually present)
    drop_cols = ['Unnamed: 0', 'post_id', 'post_created', 'user_id',
                 'followers', 'friends', 'favourites', 'statuses', 'retweets']
    df = df.drop(columns=[col for col in drop_cols if col in df.columns])

    # A missing text cannot be cleaned: the regex calls below raise TypeError
    # on NaN, so such rows are removed explicitly.
    df = df.dropna(subset=['post_text'])

    # NOTE(review): keep=False removes *every* copy of a duplicated row, not
    # just the extra ones. Left unchanged so existing results stay the same;
    # confirm whether keeping the first occurrence was intended.
    df = df.drop_duplicates(keep=False)

    # Guarantee string input for the regex steps
    df['post_text'] = df['post_text'].astype(str)

    # Step 1: Remove URLs and mark presence.
    # Two plain element-wise passes instead of building a pd.Series per row;
    # this is much cheaper and also works on an empty frame.
    df['URLs'] = df['post_text'].apply(lambda t: int(bool(url_re.search(t))))
    df['post_text'] = df['post_text'].apply(lambda t: url_re.sub('', t))

    # Step 2: Remove Mentions and mark presence
    df['Mentions'] = df['post_text'].apply(lambda t: int(bool(mention_re.search(t))))
    df['post_text'] = df['post_text'].apply(lambda t: mention_re.sub('', t))

    # Step 3: Extract Hashtags and remove from text
    df['Hashtags'] = df['post_text'].apply(hashtag_re.findall)
    df['post_text'] = df['post_text'].apply(lambda t: hashtag_re.sub('', t))

    # Step 4: Convert emojis to their textual names
    def convert_emojis(text):
        text = emoji.demojize(text)
        text = re.sub(r':([a-zA-Z_]+):', r' \1 ', text)
        return re.sub(r'\s+', ' ', text.replace('_', ' ')).strip()
    df['post_text'] = df['post_text'].apply(convert_emojis)

    # Step 5: Expand contractions
    df['post_text'] = df['post_text'].apply(contractions.fix)

    # Step 6: Remove special characters and lowercase
    df['post_text'] = df['post_text'].apply(lambda x: re.sub(r'[^\w\s]', '', x.lower()))

    # Step 7: Normalize slang (words found in the standard vocabulary are kept)
    def normalize_slang(text):
        return ' '.join([slang_dict.get(w, w) if w.lower() not in standard_words else w
                         for w in text.split()])
    df['post_text'] = df['post_text'].apply(normalize_slang)

    # Step 8: One-hot encode hashtags.
    # Sorted for a reproducible column order, and attached with a single
    # concat instead of one column insertion per hashtag.
    all_hashtags = sorted({h for tags in df['Hashtags'] for h in tags})
    if all_hashtags:
        tag_sets = [set(tags) for tags in df['Hashtags']]
        indicators = pd.DataFrame(
            {f'hashtag_{tag[1:]}': [int(tag in present) for present in tag_sets]
             for tag in all_hashtags},
            index=df.index,
        )
        # An existing column with the same name is replaced, as before
        df = df.drop(columns=[col for col in indicators.columns if col in df.columns])
        df = pd.concat([df, indicators], axis=1)

    # Drop Hashtags column
    df = df.drop(columns=['Hashtags'])

    # Ensure label is last column if it exists
    if 'label' in df.columns:
        label_col = df.pop('label')
        df['label'] = label_col

    return df


[nltk_data] Downloading package words to
[nltk_data]     /Users/nanphattongsirisukool/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [4]:
from sklearn.utils import resample

def undersample(df, label_col='label', random_state=42):
    """Balance a 0/1-labelled frame by downsampling the larger class.

    Rows with label 0 and label 1 are separated; whichever group is larger is
    sampled without replacement down to the size of the smaller one. The two
    groups are then concatenated, shuffled, and re-indexed from 0. Rows with
    any other label value are not part of the result.

    Args:
        df: DataFrame containing ``label_col``.
        label_col: Name of the binary label column.
        random_state: Seed used for both the downsampling and the shuffle.

    Returns:
        A new, shuffled DataFrame with equally sized classes.
    """
    df_zero = df[df[label_col] == 0]
    df_one = df[df[label_col] == 1]

    # Decide which class is the majority from the data instead of assuming it
    # is class 0; sampling without replacement fails if the "majority" is in
    # fact the smaller group.
    if len(df_zero) >= len(df_one):
        df_majority, df_minority = df_zero, df_one
    else:
        df_majority, df_minority = df_one, df_zero

    # Downsample majority class to match minority class
    df_majority_downsampled = resample(
        df_majority,
        replace=False,  # without replacement
        n_samples=len(df_minority),
        random_state=random_state
    )

    # Combine minority class with downsampled majority class
    df_balanced = pd.concat([df_majority_downsampled, df_minority])

    # Shuffle the result
    df_balanced = df_balanced.sample(frac=1, random_state=random_state).reset_index(drop=True)

    return df_balanced


In [5]:
import pandas as pd

# Load the raw CSV and keep only the columns at positions 2 and 3 (0-based).
# NOTE(review): read_csv treats the first line of the file as a header here;
# if the file has no header row, its first record is silently consumed.
# Confirm, and pass header=None if that is the case.
df = pd.read_csv('twitter_training.csv').iloc[:, 2:4]
df.columns = ['label', 'post_text']

# Binarise the label: 1 for the `target` sentiment, 0 for everything else.
# Popping the column and adding it back places 'label' after 'post_text';
# the column is moved, not dropped.
target = 'Negative'
raw_labels = df.pop('label')
df['label'] = (raw_labels == target).astype(int)

# Optional: balance the classes before cleaning.
# df = undersample(df)
df

Unnamed: 0,post_text,label
0,I am coming to the borders and I will kill you...,0
1,im getting on borderlands and i will kill you ...,0
2,im coming on borderlands and i will murder you...,0
3,im getting on borderlands 2 and i will murder ...,0
4,im getting into borderlands and i can murder y...,0
...,...,...
74676,Just realized that the Windows partition of my...,0
74677,Just realized that my Mac window partition is ...,0
74678,Just realized the windows partition of my Mac ...,0
74679,Just realized between the windows partition of...,0


In [6]:
# Clean the tweet text and add the URL / mention / hashtag feature columns.
df_cleaned = preprocess_twitter_df(df)

In [7]:
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import wordnet, stopwords
from nltk.stem import WordNetLemmatizer

# Fetch every NLTK resource used by the tokenisation / lemmatisation step
for resource in ('punkt', 'averaged_perceptron_tagger', 'wordnet', 'stopwords'):
    nltk.download(resource)

# Shared objects for lemmatisation and stop-word filtering
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def tokenize_and_lemmatize(df: pd.DataFrame) -> pd.DataFrame:
    """Turn ``post_text`` into lemmatised, stop-word-free token lists.

    Each text is lowercased and tokenised, tokens in ``stop_words`` are
    removed, the remaining tokens are POS-tagged and lemmatised with the
    matching WordNet part of speech.

    Args:
        df: DataFrame with a ``post_text`` column. It is copied, not modified.

    Returns:
        A new DataFrame without ``post_text`` (and without any ``tokens``
        column), with a ``processed_tokens`` column holding one list of
        lemmas per row, and ``label`` (when present) as the last column.
    """
    df = df.copy()

    # Step 1: Tokenize
    token_lists = df['post_text'].apply(lambda x: word_tokenize(str(x).lower()))

    # Step 2: Remove stopwords
    filtered = [[word for word in tokens if word not in stop_words]
                for tokens in token_lists]

    # Step 3: Lemmatize with POS tagging
    def get_wordnet_pos(tag):
        if tag.startswith('J'):
            return wordnet.ADJ
        elif tag.startswith('V'):
            return wordnet.VERB
        elif tag.startswith('N'):
            return wordnet.NOUN
        elif tag.startswith('R'):
            return wordnet.ADV
        else:
            return wordnet.NOUN

    # Tag all rows in one batch call: pos_tag_sents applies the same tagger
    # to every token list, instead of going through pos_tag once per row.
    tagged_rows = nltk.pos_tag_sents(filtered)
    lemmas = [[lemmatizer.lemmatize(word, get_wordnet_pos(pos)) for word, pos in tagged]
              for tagged in tagged_rows]
    df['processed_tokens'] = pd.Series(lemmas, index=df.index, dtype=object)

    # Drop columns 'post_text' and 'tokens'
    df = df.drop(columns=['post_text', 'tokens'], errors='ignore')

    # Reorder label to last column if exists
    if 'label' in df.columns:
        label = df.pop('label')
        df['label'] = label

    return df


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/nanphattongsirisukool/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/nanphattongsirisukool/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/nanphattongsirisukool/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nanphattongsirisukool/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
# Tokenise + lemmatise the cleaned text and preview the first rows.
df_tokenized = tokenize_and_lemmatize(df_cleaned)
df_tokenized.head()

Unnamed: 0,URLs,Mentions,processed_tokens,label
0,0,0,"[come, border, kill]",0
1,0,0,"[get, borderland, kill]",0
2,0,0,"[come, borderland, murder]",0
3,0,0,"[get, borderland, 2, murder]",0
4,0,0,"[get, borderland, murder]",0


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

def train_model(df_tokenized, model=None):
    """Train and evaluate a TF-IDF text classifier.

    The data is split 70/15/15 into train/validation/test (stratified on
    ``label``, fixed seed), a TF-IDF vectorizer is fitted on the training
    texts only, the classifier is fitted, and a classification report is
    printed for the validation and the test set.

    Note: a later cell in this notebook defines ``train_model`` again with a
    different default estimator; whichever definition ran last is the one
    that gets called.

    Args:
        df_tokenized: DataFrame with a ``processed_tokens`` column (a list of
            tokens per row, or the string repr of such a list) and a
            ``label`` column.
        model: Optional unfitted estimator exposing ``fit``/``predict``.
            Defaults to ``RandomForestClassifier(random_state=42)``.

    Returns:
        Tuple ``(model, vectorizer)`` of the fitted estimator and the fitted
        TfidfVectorizer.
    """
    import ast

    def join_tokens(tokens):
        # A token list may arrive as its string repr; parse it with
        # literal_eval, which only accepts Python literals, rather than
        # eval, which would execute arbitrary expressions found in the data.
        if isinstance(tokens, str):
            tokens = ast.literal_eval(tokens)
        return ' '.join(tokens)

    if model is None:
        model = RandomForestClassifier(random_state=42)

    df = df_tokenized.copy()

    # 1. Split into train/val/test (70/15/15): 30% is held out, then halved
    df_train, df_temp = train_test_split(df, test_size=0.3, stratify=df['label'], random_state=42)
    df_val, df_test = train_test_split(df_temp, test_size=0.5, stratify=df_temp['label'], random_state=42)

    # 2. Prepare features and target
    X_train = df_train['processed_tokens'].apply(join_tokens)
    X_val = df_val['processed_tokens'].apply(join_tokens)
    X_test = df_test['processed_tokens'].apply(join_tokens)

    y_train = df_train['label']
    y_val = df_val['label']
    y_test = df_test['label']

    # 3. TF-IDF vectorization (vocabulary learned from the training split only)
    vectorizer = TfidfVectorizer(max_features=5000)
    X_train_tfidf = vectorizer.fit_transform(X_train)
    X_val_tfidf = vectorizer.transform(X_val)
    X_test_tfidf = vectorizer.transform(X_test)

    # 4. Train the model (no numeric features)
    model.fit(X_train_tfidf, y_train)

    # 5. Validation report
    print("Validation Set Evaluation:")
    y_val_pred = model.predict(X_val_tfidf)
    print(classification_report(y_val, y_val_pred))

    # 6. Test report
    print("Test Set Evaluation:")
    y_test_pred = model.predict(X_test_tfidf)
    print(classification_report(y_test, y_test_pred))

    return model, vectorizer


In [93]:
# Fit the classifier on the tokenised data and print the evaluation reports.
model, vectorizer = train_model(df_tokenized)


Validation Set Evaluation:
              precision    recall  f1-score   support

           0       0.93      0.97      0.95      7015
           1       0.93      0.83      0.88      3096

    accuracy                           0.93     10111
   macro avg       0.93      0.90      0.92     10111
weighted avg       0.93      0.93      0.93     10111

Test Set Evaluation:
              precision    recall  f1-score   support

           0       0.93      0.97      0.95      7016
           1       0.93      0.83      0.88      3096

    accuracy                           0.93     10112
   macro avg       0.93      0.90      0.91     10112
weighted avg       0.93      0.93      0.93     10112



In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

def train_model(df_tokenized, model=None):
    """Train and evaluate a TF-IDF text classifier.

    The data is split 70/15/15 into train/validation/test (stratified on
    ``label``, fixed seed), a TF-IDF vectorizer is fitted on the training
    texts only, the classifier is fitted, and a classification report is
    printed for the validation and the test set.

    Note: this definition shadows the ``train_model`` defined in an earlier
    cell of this notebook, which defaults to a random forest.

    Args:
        df_tokenized: DataFrame with a ``processed_tokens`` column (a list of
            tokens per row, or the string repr of such a list) and a
            ``label`` column.
        model: Optional unfitted estimator exposing ``fit``/``predict``.
            Defaults to ``LogisticRegression(max_iter=1000)``.

    Returns:
        Tuple ``(model, vectorizer)`` of the fitted estimator and the fitted
        TfidfVectorizer.
    """
    import ast

    def join_tokens(tokens):
        # A token list may arrive as its string repr; parse it with
        # literal_eval, which only accepts Python literals, rather than
        # eval, which would execute arbitrary expressions found in the data.
        if isinstance(tokens, str):
            tokens = ast.literal_eval(tokens)
        return ' '.join(tokens)

    if model is None:
        model = LogisticRegression(max_iter=1000)

    df = df_tokenized.copy()

    # 1. Split into train/val/test (70/15/15): 30% is held out, then halved
    df_train, df_temp = train_test_split(df, test_size=0.3, stratify=df['label'], random_state=42)
    df_val, df_test = train_test_split(df_temp, test_size=0.5, stratify=df_temp['label'], random_state=42)

    # 2. Prepare features and target
    X_train = df_train['processed_tokens'].apply(join_tokens)
    X_val = df_val['processed_tokens'].apply(join_tokens)
    X_test = df_test['processed_tokens'].apply(join_tokens)

    y_train = df_train['label']
    y_val = df_val['label']
    y_test = df_test['label']

    # 3. TF-IDF vectorization (vocabulary learned from the training split only)
    vectorizer = TfidfVectorizer(max_features=5000)
    X_train_tfidf = vectorizer.fit_transform(X_train)
    X_val_tfidf = vectorizer.transform(X_val)
    X_test_tfidf = vectorizer.transform(X_test)

    # 4. Train the model (no numeric features)
    model.fit(X_train_tfidf, y_train)

    # 5. Validation report
    print("Validation Set Evaluation:")
    y_val_pred = model.predict(X_val_tfidf)
    print(classification_report(y_val, y_val_pred))

    # 6. Test report
    print("Test Set Evaluation:")
    y_test_pred = model.predict(X_test_tfidf)
    print(classification_report(y_test, y_test_pred))

    return model, vectorizer


In [9]:
# Fit the classifier on the tokenised data and print the evaluation reports.
model, vectorizer = train_model(df_tokenized)


Validation Set Evaluation:
              precision    recall  f1-score   support

           0       0.84      0.93      0.88      7015
           1       0.79      0.60      0.68      3096

    accuracy                           0.83     10111
   macro avg       0.81      0.77      0.78     10111
weighted avg       0.82      0.83      0.82     10111

Test Set Evaluation:
              precision    recall  f1-score   support

           0       0.84      0.93      0.88      7016
           1       0.79      0.60      0.68      3096

    accuracy                           0.83     10112
   macro avg       0.82      0.77      0.78     10112
weighted avg       0.83      0.83      0.82     10112



In [94]:
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import wordnet, stopwords
from nltk.stem import WordNetLemmatizer

# Make sure these are downloaded once
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('wordnet')
# nltk.download('stopwords')

# Same objects as the ones created in the tokenize_and_lemmatize cell;
# rebuilt here for the single-post inference helpers below.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def get_wordnet_pos(tag):
    """Map a POS tag string to the WordNet part-of-speech constant.

    The first character decides: 'J' -> ADJ, 'V' -> VERB, 'N' -> NOUN,
    'R' -> ADV. Any other tag falls back to NOUN.
    """
    prefix_to_pos = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    return prefix_to_pos.get(tag[:1], wordnet.NOUN)

def preprocess_post(text):
    """Prepare one raw post for the vectorizer, exactly as training data was.

    The training texts pass through ``preprocess_twitter_df`` (URL, mention
    and hashtag removal, emoji and contraction expansion, punctuation removal,
    slang normalisation) before ``tokenize_and_lemmatize``. Tokenising the
    raw text directly would skip that cleaning and feed the model tokens it
    never saw during training, so the post is routed through the same two
    functions here.

    Args:
        text: The raw post as a string.

    Returns:
        The lemmatised tokens joined by single spaces ('' when nothing
        remains after cleaning).
    """
    frame = pd.DataFrame({'post_text': [text]})
    cleaned = preprocess_twitter_df(frame)
    token_lists = tokenize_and_lemmatize(cleaned)['processed_tokens']
    if token_lists.empty:
        return ''
    return ' '.join(token_lists.iloc[0])

def predict_post(text, model, vectorizer):
    """Predict the label of a single post.

    The text is run through ``preprocess_post``, turned into TF-IDF features
    with the already fitted ``vectorizer``, and passed to ``model.predict``.
    Returns the first (only) element of the prediction.
    """
    features = vectorizer.transform([preprocess_post(text)])
    return model.predict(features)[0]


In [118]:
# Interactive check: type a post and classify it with the current model.
# Note that input() waits for the user, so this cell pauses a "Run All".
text = input()
label = predict_post(text, model, vectorizer)
print("Predicted label:", label)


Predicted label: 1


In [111]:
import pandas as pd

# Rebuild the whole pipeline with 'Positive' as the class to detect.
# This overwrites df, df_cleaned, df_tokenized, model and vectorizer.
target = 'Positive'

df = pd.read_csv('twitter_training.csv').iloc[:, 2:4]
df.columns = ['label', 'post_text']

# Re-adding the popped column places 'label' last; 1 = target, 0 = the rest
raw_labels = df.pop('label')
df['label'] = (raw_labels == target).astype(int)

df_cleaned = preprocess_twitter_df(df)
df_tokenized = tokenize_and_lemmatize(df_cleaned)
model, vectorizer = train_model(df_tokenized)


Validation Set Evaluation:
              precision    recall  f1-score   support

           0       0.92      0.97      0.95      7360
           1       0.92      0.78      0.84      2753

    accuracy                           0.92     10113
   macro avg       0.92      0.88      0.89     10113
weighted avg       0.92      0.92      0.92     10113

Test Set Evaluation:
              precision    recall  f1-score   support

           0       0.92      0.97      0.95      7361
           1       0.92      0.77      0.84      2752

    accuracy                           0.92     10113
   macro avg       0.92      0.87      0.89     10113
weighted avg       0.92      0.92      0.92     10113



In [117]:
# Interactive check: type a post and classify it with the current model.
# Note that input() waits for the user, so this cell pauses a "Run All".
text = input()
label = predict_post(text, model, vectorizer)
print("Predicted label:", label)


Predicted label: 1


In [122]:
# Spot-check the current model on a few hand-written posts
sample_posts = [
    "I love you",
    "I hate you",
    "you look great",
    "Fucking weird",
    "you look pretty",
]
for text in sample_posts:
    label = predict_post(text, model, vectorizer)
    print("Predicted label:", label)

Predicted label: 1
Predicted label: 0
Predicted label: 1
Predicted label: 0
Predicted label: 1


# Validate the model trained on the new dataset against the old dataset

In [None]:
import pandas as pd

# NOTE(review): the column renaming and label binarisation below are commented
# out, so `df` keeps whatever column names the CSV provides.
# preprocess_twitter_df reads df['post_text'], so this cell presumably fails
# with a KeyError unless the CSV header already uses that name. Confirm and
# restore the setup lines before running.
# NOTE(review): as written this retrains and overwrites `model` / `vectorizer`
# rather than evaluating an existing model on another dataset. Confirm intent.
# target = 'Positive'

df = pd.read_csv('twitter_training.csv').iloc[:,2:4]

# df.columns = ['label', 'post_text']

# df['label'] = df.pop('label')

# df['label'] = (df['label'] == target).astype(int)

df_cleaned = preprocess_twitter_df(df)

df_tokenized = tokenize_and_lemmatize(df_cleaned)

model, vectorizer = train_model(df_tokenized)
