### Step 01 - Reading Data

Import modules

In [1]:
# Colab-only setup: mount Google Drive at /content/drive.
from google.colab import drive
drive.mount('/content/drive')
# Switch the working directory to the Drive folder "data nlp" (the backslash
# escapes the space) so that relative paths such as 'cleaned.csv' resolve there.
%cd /content/drive/MyDrive/data \nlp

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/data nlp


In [2]:
import pandas as pd
import nltk

Read the dataset (I saved a cleaned version)

In [3]:
import pandas as pd
import os

# Encodings to try, in order. 'utf-8' used to be listed twice; the second
# attempt could never behave differently from the first, so it is removed.
# NOTE(review): ISO-8859-1 maps every byte to a character and therefore never
# raises UnicodeDecodeError, so 'cp1252' is only reached if that changes.
encodings = ['utf-8', 'ISO-8859-1', 'cp1252']

# Try each encoding until one decodes the file.
for encoding in encodings:
    try:
        df = pd.read_csv('cleaned.csv', encoding=encoding)
    except UnicodeDecodeError:
        print("Error reading CSV file with encoding:", encoding)
    else:
        print("CSV file read successfully with encoding:", encoding)
        break  # Stop at the first encoding that works
else:
    # No encoding worked: fail here with a clear message instead of letting
    # the cell die on an unrelated NameError when `df` is displayed below.
    raise ValueError(
        "Could not decode 'cleaned.csv' with any of: " + ", ".join(encodings)
    )
df

CSV file read successfully with encoding: utf-8


Unnamed: 0,X1,X6
0,0,switchfoot awww bummer shoulda got david carr ...
1,0,upset updat facebook text might cri result sch...
2,0,kenichan dive mani time ball manag save rest g...
3,0,whole bodi feel itchi like fire
4,0,nationwideclass behav mad see
...,...,...
1599995,4,woke school best feel ever
1599996,4,thewdb com cool hear old walt interview
1599997,4,readi mojo makeov ask detail
1599998,4,happi th birthday boo alll time tupac amaru sh...


In [4]:
# Preview the first five rows (five is head()'s default).
df.head()

Unnamed: 0,X1,X6
0,0,switchfoot awww bummer shoulda got david carr ...
1,0,upset updat facebook text might cri result sch...
2,0,kenichan dive mani time ball manag save rest g...
3,0,whole bodi feel itchi like fire
4,0,nationwideclass behav mad see


In [5]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(1600000, 2)

### Step 02 - Preprocessing

Preprocessing Tasks:

*   Lower case
*   Tokenization
*   Removing special characters and URLs
*   Removing stop words and punctuation
*   Stemming

In [6]:
# Disabled preprocessing cell, kept for reference only (every line is commented out).
# NOTE(review): presumably this is the routine that produced cleaned.csv - confirm.
# from unicodedata import normalize
# import re
# from nltk.corpus import stopwords
# from nltk.stem import PorterStemmer

# def analyser(x):
#     x = str(x)

#     x = re.sub(r'<[^>]+>', '', x) # Strip HTML tags

#     x = re.sub(r'http\S+', '', x) # Remove links (URLs)

#     # NFC-normalise, then drop every character that cannot be encoded as ASCII
#     x = normalize('NFC', x).encode('ascii', 'ignore').decode('utf-8', 'ignore')

#     x = x.lower() # Lowercase

#     x = re.sub("[^a-z]", " ", x) # Replace everything except a-z (punctuation, digits, ...) with a space

#     x = x.split()

#     stop_words = set(stopwords.words('english'))
#     x = [word for word in x if word not in stop_words]  # Drop English stop words


#     stemmer = PorterStemmer() # Stem each remaining word
#     x = [stemmer.stem(word) for word in x]

#     return " ".join(x).strip()

# analyser('hysÂ·terÂ·iÂ·a [ hi stÃ©eree É™ ] - a state of extreme emotion.')   #Example

### Step 03 - Feature Extraction

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [8]:
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')

# Empty CSV fields are read back by pandas as NaN, and a bare astype(str)
# turns NaN into the literal word "nan". That bogus token would be counted
# here and would also reach the TF-IDF vocabulary later. Replace missing text
# with an empty string before casting.
# NOTE(review): assumes some rows became empty during cleaning - confirm with
# df['X6'].isna().sum() on a freshly loaded frame.
df['X6'] = df['X6'].fillna('').astype(str)

# Concatenate every document into one space-separated string
text_data = ' '.join(df['X6'])

# Tokenize the string into individual words
words = word_tokenize(text_data)

# Count the number of tokens (words)
total_words = len(words)

print("Total number of words:", total_words)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Total number of words: 12106854


In [9]:
from nltk.parse.corenlp import transform
# NOTE(review): `transform` is not used in any visible cell; this looks like an
# accidental auto-import. Confirm nothing else needs it before removing.

# TF-IDF features restricted to the 400 highest-frequency terms.
tf_vec = TfidfVectorizer(max_features=400)

# Keep the result as the sparse matrix that fit_transform returns. Calling
# .toarray() on 1,600,000 x 400 values materialises about 5 GB of float64, and
# train_test_split then copies it. train_test_split, LogisticRegression and
# MultinomialNB all accept sparse input, and X.shape is unchanged.
X = tf_vec.fit_transform(df['X6'])

In [10]:
# (number of documents, number of TF-IDF features) of the feature matrix.
X.shape

(1600000, 400)

In [11]:
# Target vector: the X1 column as a NumPy array. The preview above shows the
# values 0 and 4 - presumably negative/positive sentiment labels; confirm.
Y = df['X1'].values

### Step 04 - Learning

In [12]:
from scipy.sparse import random
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# NOTE(review): `random` from scipy.sparse is not used in any visible cell and
# takes over the name `random` in the notebook namespace - likely an
# accidental auto-import; confirm before removing.

# Hold out 20% of the samples for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=2,
)

In [13]:
# Baseline classifier: logistic regression with scikit-learn's default settings,
# fitted on the training split.
model = LogisticRegression()
model.fit(X=X_train, y=y_train)

In [14]:
from sklearn.metrics import accuracy_score

# Predict on the held-out split and report the fraction of correct labels.
y_pred = model.predict(X_test)
print(accuracy_score(y_true=y_test, y_pred=y_pred))

0.71905625


In [15]:
from sklearn.naive_bayes import MultinomialNB

# Second classifier: multinomial naive Bayes with smoothing alpha = 0.5.
# This rebinds `model`, so the logistic-regression estimator fitted earlier is
# no longer reachable under that name.
model = MultinomialNB(alpha=0.5)
model.fit(X=X_train, y=y_train)

In [16]:
from sklearn.metrics import accuracy_score

# Score whichever estimator `model` currently refers to on the held-out split.
y_pred = model.predict(X_test)
print(accuracy_score(y_true=y_test, y_pred=y_pred))

0.71398125
