# Imports


In [1]:
import pandas as pd
import nltk
import re
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import recall_score
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics  import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier

# Select Data

In [2]:
df=pd.read_csv("Onlinesafety.csv", sep=',', header=0)

#Removing duplicates
df=df.drop_duplicates(subset=None, keep='first', inplace=False)

X=df['tweet']
Y=df['class']

# Data Processing 

In [3]:
#Removing NaN instance
X=X.dropna()
Y=Y.dropna()

    
#1----------------------------------Normalisation Lower case
    
X=X.str.lower()
        
#2----------------------------------Elimination des Usernames
        
def RemoveUsername(tweet):
    x=re.sub('@[^\s]+','',tweet)
    return x
        
X=X.apply(lambda x: RemoveUsername(x))
    
#3----------------------------------Elimination des Hashtags
    
def RemoveHashtags(tweet):
    x=re.sub('\#[^\s]+','',tweet)
    return x

X=X.apply(lambda x: RemoveHashtags(x))
        
#4---------------------------------Elimination des URL's
        
def RemoveUrl(tweet):
    x=re.sub('http\S+','',tweet)
    return x
    
X=X.apply(lambda x: RemoveUrl(x))
        
#5--------------------------------Elimination de la ponctuation et caracteres speciaux 
        
def RemovePunc(tweet):
    x=re.sub('[^\w\D\s]','',tweet)
    return x

X=X.apply(lambda x: RemovePunc(x))


#6------------------------------Lemmatisation 
        
lemmatizer = WordNetLemmatizer()

X=X.apply(lambda x: lemmatizer.lemmatize(x))


# Split Data

In [4]:
X_train, X_test, y_train,y_test= train_test_split(X,Y,test_size=0.20,random_state=42)

X_train,X_val, y_train, y_val =train_test_split(X_train,y_train,test_size=0.25,random_state=42)

# Vectors

In [5]:
vectorizer = TfidfVectorizer(max_features=4000, stop_words=stopwords.words('english'), ngram_range=(1,2))
X_train = vectorizer.fit_transform(X_train).toarray()
X_val = vectorizer.transform(X_val).toarray()
X_test= vectorizer.transform(X_test).toarray()

# Classification

In [6]:
#Training
model=MultinomialNB()
model.fit(X_train,y_train)

#Predicting 
pred=model.predict(X_val)


#Evaluation de l'algorithme

print(accuracy_score(y_val,pred))

0.8456727859592496


In [7]:
#Training
model=BernoulliNB()
model.fit(X_train,y_train)

#Predicting 
pred=model.predict(X_val)


#Evaluation de l'algorithme

print(accuracy_score(y_val,pred))

0.8813798668549526


In [8]:
#Training
model=DecisionTreeClassifier()
model.fit(X_train,y_train)

#Predicting 
pred=model.predict(X_val)


#Evaluation de l'algorithme

print(accuracy_score(y_val,pred))

0.861206374823482


In [9]:
#Training
model=RandomForestClassifier()
model.fit(X_train,y_train)

#Predicting 
pred=model.predict(X_val)


#Evaluation de l'algorithme

print(accuracy_score(y_val,pred))

0.8884405890659673


In [10]:
#Training
model=SVC()
model.fit(X_train,y_train)

#Predicting 
pred=model.predict(X_val)


#Evaluation de l'algorithme

print(accuracy_score(y_val,pred))

0.8894492636675408


## GridSearch

In [11]:
def hyper_para(params,score,file):
    file.write("Best Params: "+str(params))

### SVM

In [None]:
res=open("SVC_hyperpar.txt","w")
        
grid_params={
    'C': [0.1, 1, 10, 100, 1000],  
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001], 
    'kernel': ['rbf']
                }

Gs=GridSearchCV(
    SVC(),
    grid_params,
    verbose=0,
    cv=5,
    n_jobs=-1,
    scoring='accuracy'
               )

grid_result = Gs.fit(X_train, y_train)

best_params = grid_result.best_params_
best_score = grid_result.best_score_
hyper_para(best_params,best_score,res)
                           

res.close()



### RandomForest

In [None]:
res=open("RF_hyperpar.txt","w")

        
grid_params={
    'max_features': ['auto', 'sqrt'],
    'min_samples_split': [2, 5, 10],
    'n_estimators': [200, 300, 400]
                }

Gs=GridSearchCV(
    RandomForestClassifier(),
    grid_params,
    verbose=0,
    cv=5,
    n_jobs=-1,
    scoring='accuracy'
               )

grid_result = Gs.fit(X_train, y_train)

best_params = grid_result.best_params_
best_score = grid_result.best_score_
hyper_para(best_params,best_score,res)
                           

res.close()

# Final Test 

In [13]:
#Training
model=SVC(C= 10, gamma=0.1, kernel='rbf')
model.fit(X_train,y_train)

#Predicting 
pred=model.predict(X_test)


#Evaluation de l'algorithme

print(accuracy_score(y_test,pred))


0.8924752874722615


In [14]:
print(classification_report(y_test,pred))

              precision    recall  f1-score   support

         0.0       0.51      0.24      0.33       290
         1.0       0.92      0.95      0.94      3832
         2.0       0.83      0.84      0.83       835

    accuracy                           0.89      4957
   macro avg       0.75      0.68      0.70      4957
weighted avg       0.88      0.89      0.88      4957



In [15]:
#Training
model=RandomForestClassifier(max_features='auto',min_samples_split=5,n_estimators=200)
model.fit(X_train,y_train)

#Predicting 
pred=model.predict(X_test)


#Evaluation de l'algorithme

print(accuracy_score(y_test,pred))
print(classification_report(y_test,pred))

0.8872301795440791
              precision    recall  f1-score   support

         0.0       0.41      0.13      0.20       290
         1.0       0.91      0.96      0.93      3832
         2.0       0.82      0.83      0.83       835

    accuracy                           0.89      4957
   macro avg       0.72      0.64      0.65      4957
weighted avg       0.87      0.89      0.87      4957

