In [1]:
import nltk
import pandas as pd
import numpy

file_path = 'ISIS Dataset.csv'  # Replace with the actual file path
df = pd.read_csv(file_path)


In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Load the dataset
file_path = 'ISIS Dataset.csv'
data = pd.read_csv(file_path)

# Assuming there's a column named 'Text' that contains the text data
# and 'Label' that contains the label whether the text is extremist or not
text_data = data['cleaned_text']
labels = data['Labels']

# Preprocess the data (basic preprocessing)
# Lowercase, remove punctuation, etc.
text_data = text_data.str.lower().str.replace('[^\w\s]', ' ', regex=True)

# Initialize TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')

# Fit and transform the text data to create TF-IDF features
tfidf_features = tfidf_vectorizer.fit_transform(text_data)

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(tfidf_features, labels, test_size=0.2, random_state=42)

# X_train and y_train can be used to train a machine learning model
# X_test and y_test can be used to evaluate the model



In [4]:
import pandas as pd
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# Load the dataset
df = pd.read_csv('ISIS Dataset.csv')

# Define features (text data) and target variable
X = df['cleaned_text']
y = df['Labels']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define a TF-IDF vectorizer to convert text data into numerical features
tfidf_vectorizer = TfidfVectorizer(max_features=10000, stop_words='english')

# Define a SVM classifier
svm_classifier = SVC(kernel='linear')

# Create a pipeline combining the TF-IDF vectorizer and the SVM classifier
pipeline = Pipeline([
    ('tfidf', tfidf_vectorizer),
    ('classifier', svm_classifier)
])

# Hyperparameter tuning using GridSearchCV
param_grid = {
    'tfidf__max_features': [10000, 20000, None],
    'classifier__C': [0.1, 1, 10],
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

# Train the model with the best parameters
pipeline = grid_search.best_estimator_
pipeline.fit(X_train, y_train)

# Predict on the test set
y_pred = pipeline.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"ROC AUC Score: {roc_auc:.4f}")
print("Confusion Matrix:")
print(conf_matrix)

# Save the trained pipeline to a file
joblib.dump(pipeline, 'trained_pipeline.pkl')


Best Parameters: {'classifier__C': 10, 'tfidf__max_features': 10000}
Accuracy: 0.8211
Precision: 0.5763
Recall: 0.2500
F1 Score: 0.3487
ROC AUC Score: 0.6032
Confusion Matrix:
[[549  25]
 [102  34]]


['trained_pipeline.pkl']

In [5]:
import joblib

# Load the trained pipeline
pipeline = joblib.load('trained_pipeline.pkl')  # Replace 'trained_pipeline.pkl' with the path to your trained pipeline file

def predict_extremism(content):
    # Predict whether the content is extremist or not
    prediction = pipeline.predict([content])[0]
    return 'Extremist' if prediction == 1 else 'Not Extremist'

def main():
    print("Enter the content you want to analyze (type 'exit' to quit):")
    while True:
        content = input("> ")
        if content.lower() == 'exit':
            print("Exiting...")
            break
        else:
            result = predict_extremism(content)
            print(f"Prediction: {result}")

if __name__ == "__main__":
    main()


Enter the content you want to analyze (type 'exit' to quit):
> i enjoy ad hominoid attacks as it tells me you are an intellectual midget   my apology to littel people 
Prediction: Extremist
> where s a fucking drone strike mishap when ya need one    sheesh      
Prediction: Extremist
> ask jonathan greenblatt if he disavows any of these things   if he does  lock him up in german jail for minimizing the hollow caust 
Prediction: Extremist
> ho lee fuk  chinese restaurants in canada have sum ting wong with them  they are racist   lol  
Prediction: Extremist
> exit
Exiting...


Enter the content you want to analyze (type 'exit' to quit):
