In [2]:
import numpy as np
import pandas as pd
import sklearn 

In [3]:
#import data

#training data
train = pd.read_csv("twitter_train.csv")
#testing data
test = pd.read_csv("twitter_test.csv")



In [4]:
train.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [5]:
test.head()

Unnamed: 0,id,tweet
0,31963,#studiolife #aislife #requires #passion #dedic...
1,31964,@user #white #supremacists want everyone to s...
2,31965,safe ways to heal your #acne!! #altwaystohe...
3,31966,is the hp and the cursed child book up for res...
4,31967,"3rd #bihday to my amazing, hilarious #nephew..."


In [6]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31962 entries, 0 to 31961
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      31962 non-null  int64 
 1   label   31962 non-null  int64 
 2   tweet   31962 non-null  object
dtypes: int64(2), object(1)
memory usage: 749.2+ KB


In [7]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17197 entries, 0 to 17196
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      17197 non-null  int64 
 1   tweet   17197 non-null  object
dtypes: int64(1), object(1)
memory usage: 268.8+ KB


In [8]:
# non racist related tweets

sum(train["label"] == 0)

#this data count indicates that there are no improper or profanity words used in the tweet



29720

In [9]:
# racist tweets
sum(train["label"] == 1)
#this data indicates that there are improper or profanity words used in the tweet

2242

In [19]:
# check if there are any missing values in the data
train.isnull().sum()
#there are no missing values in this dataset 

id             0
label          0
tweet          0
clean_tweet    0
dtype: int64

In [11]:
#data cleaning
#we will install tweet-preprocessor to clean the tweets


In [12]:
import re

In [13]:
REPLACE_NO_SPACE = re.compile("[.;:!\'?,\"()\[\]]")
REPLACE_WITH_SPACE = re.compile("(<br\s*/><br\s*/>)|(\-)|(\/)")

def clean_tweets(df):
    tempArr = []
    for line in df:
        # Remove URLs, mentions, hashtags
        line = re.sub(r"http\S+|@\S+|#\S+", "", line)
        # Lowercase and strip punctuation
        line = REPLACE_NO_SPACE.sub("", line.lower())
        line = REPLACE_WITH_SPACE.sub(" ", line)
        tempArr.append(line)
    return tempArr

In [14]:
#cleaning training data
train_tweet = clean_tweets(train['tweet'])
train_tweet = pd.DataFrame(train_tweet)

In [15]:
train_tweet.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31962 entries, 0 to 31961
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       31962 non-null  object
dtypes: object(1)
memory usage: 249.8+ KB


In [16]:
#append clean tweets in the training data
train['clean_tweet'] = train_tweet

In [17]:
train.head()

Unnamed: 0,id,label,tweet,clean_tweet
0,1,0,@user when a father is dysfunctional and is s...,when a father is dysfunctional and is so sel...
1,2,0,@user @user thanks for #lyft credit i can't us...,thanks for credit i cant use cause they don...
2,3,0,bihday your majesty,bihday your majesty
3,4,0,#model i love u take with u all the time in ...,i love u take with u all the time in urð±...
4,5,0,factsguide: society now #motivation,factsguide society now


In [18]:
#clean the test data and append the cleaned tweets in the test data
test_tweet = clean_tweets(test['tweet'])
test_tweet = pd.DataFrame(test_tweet)
test['clean_tweet'] = test_tweet

#compare the clean and unclean tweets
test.tail()

Unnamed: 0,id,tweet,clean_tweet
17192,49155,thought factory: left-right polarisation! #tru...,thought factory left right polarisation ...
17193,49156,feeling like a mermaid ð #hairflip #neverre...,feeling like a mermaid ð â¦
17194,49157,#hillary #campaigned today in #ohio((omg)) &am...,today in &amp used words like assets&amplia...
17195,49158,"happy, at work conference: right mindset leads...",happy at work conference right mindset leads t...
17196,49159,"my song ""so glad"" free download! #shoegaze ...",my song so glad free download


In [21]:
from sklearn.model_selection import train_test_split
#extract the labels from the train data
Y = train.label.values

# use 70% training and 30% for testing
x_train, x_test, y_train, y_test = train_test_split(train.clean_tweet.values, Y, test_size = 0.3, stratify = Y, random_state = 1, shuffle = True)  

In [22]:
from sklearn.feature_extraction.text import CountVectorizer

In [23]:
documents = ["This is import data's youtube channel", 
             "Data science is my passion and it is fun",
             "Please subscribe to my channel"]

#initialising the count vectorizer 
vectorizer = CountVectorizer()

#tokenize and convert the document to a matrix 
document_term_matrix = vectorizer.fit_transform(documents)

#checking the result 
pd.DataFrame(document_term_matrix.toarray(), columns = vectorizer.get_feature_names_out())

Unnamed: 0,and,channel,data,fun,import,is,it,my,passion,please,science,subscribe,this,to,youtube
0,0,1,1,0,1,1,0,0,0,0,0,0,1,0,1
1,1,0,1,1,0,2,1,1,1,0,1,0,0,0,0
2,0,1,0,0,0,0,0,1,0,1,0,1,0,1,0


In [25]:
#vectorize tweets for model building 
vectorizer = CountVectorizer(binary = True, stop_words = 'english')

#vocabulary dictionary of all tokens in the raw documents
vectorizer.fit(list(x_train) + list(x_test))

#transform the documents to matrix format 
x_train_vec = vectorizer.transform(x_train)
x_test_vec = vectorizer.transform(x_test)



In [26]:
#model building
from sklearn import svm

#classify usingg support vector classifier
svm = svm.SVC(kernel = "linear", probability = True)

#fit the SVC model based on the given training data 
prob = svm.fit(x_train_vec, y_train).predict_proba(x_test_vec)

#perfrom classification on prediction and samples on x_test
y_pred_svm = svm.predict(x_test_vec)

In [27]:
# accuracy score for svc 
from sklearn.metrics import accuracy_score
print("Accuracy score for SVC is : ", accuracy_score(y_test, y_pred_svm)*100, "%")


Accuracy score for SVC is :  94.96297841276463 %
