In [1]:
import sklearn 
import numpy as np
import pandas as pd

#visualizations
import seaborn as sns 
import matplotlib.pyplot as plt

%matplotlib inline
sns.set_style("whitegrid")
plt.style.use("fivethirtyeight")

#consistent size plots
from pylab import rcParams
rcParams['figure.figsize']=12,5
rcParams['axes.labelsize']=12
rcParams['xtick.labelsize']=12
rcParams['ytick.labelsize']=12

#handle warnings
import warnings
warnings.filterwarnings(action='ignore',category=DeprecationWarning)
warnings.filterwarnings(action='ignore',category=FutureWarning)

#display pandas dataframe columns 
pd.options.display.max_columns = None

#get reproducible results
np.random.seed(42)
#tf.random.set_seed(42)


# Load the sklearn stuff

# making text data numeric
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer

# making numeric data comparable
from sklearn.preprocessing import StandardScaler

# experimental regime and pipeline
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score

# machine learning algorithm
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier

# evaluation metrics
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay

In [2]:
data_path = 'olid-training-v1.0.tsv'
OLID_train = pd.read_csv(data_path, delimiter='\t', encoding='utf-8')
OLID_train.head()

Unnamed: 0,id,tweet,subtask_a,subtask_b,subtask_c
0,86426,@USER She should ask a few native Americans wh...,OFF,UNT,
1,90194,@USER @USER Go home you’re drunk!!! @USER #MAG...,OFF,TIN,IND
2,16820,Amazon is investigating Chinese employees who ...,NOT,,
3,62688,"@USER Someone should'veTaken"" this piece of sh...",OFF,UNT,
4,43605,@USER @USER Obama wanted liberals &amp; illega...,NOT,,


In [3]:
OLID_train.rename({'tweet': 'text', 'subtask_a': 'label'}, axis=1, inplace=True) # rename columns if necessary
OLID_train['target'] = OLID_train.label.map({'NOT':0, 'OFF':1}) # convert label to a numerical variable
OLID_train.drop(['subtask_b', 'subtask_c'], axis=1, inplace=True) # drop irrelevant columns
OLID_train.head()

Unnamed: 0,id,text,label,target
0,86426,@USER She should ask a few native Americans wh...,OFF,1
1,90194,@USER @USER Go home you’re drunk!!! @USER #MAG...,OFF,1
2,16820,Amazon is investigating Chinese employees who ...,NOT,0
3,62688,"@USER Someone should'veTaken"" this piece of sh...",OFF,1
4,43605,@USER @USER Obama wanted liberals &amp; illega...,NOT,0


In [4]:
OLID_train.isnull().sum()

id        0
text      0
label     0
target    0
dtype: int64

In [5]:
OLID_train.duplicated().sum()

0

In [6]:
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

nltk.download('stopwords')
eng_stops = set(stopwords.words('english'))

from nltk.stem import WordNetLemmatizer 
lemmatizer = WordNetLemmatizer() 

def clean(text):
    # remove all the special characters
    text = re.sub('[^a-zA-Z]', ' ',text) 
    # convert all letters to lower case
    words = text.lower().split()
    # remove stop words
    words = [w for w in words if not w in eng_stops]    
    # lemmatizer
    words = [lemmatizer.lemmatize(word) for word in words]
    # join all words back to text
    return (' '.join(words))

OLID_train['clean'] = OLID_train['text'].apply(lambda x: clean(x))
OLID_train.head()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\valer\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\valer\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,id,text,label,target,clean
0,86426,@USER She should ask a few native Americans wh...,OFF,1,user ask native american take
1,90194,@USER @USER Go home you’re drunk!!! @USER #MAG...,OFF,1,user user go home drunk user maga trump url
2,16820,Amazon is investigating Chinese employees who ...,NOT,0,amazon investigating chinese employee selling ...
3,62688,"@USER Someone should'veTaken"" this piece of sh...",OFF,1,user someone vetaken piece shit volcano
4,43605,@USER @USER Obama wanted liberals &amp; illega...,NOT,0,user user obama wanted liberal amp illegals mo...


In [7]:
#sentiment
import nltk
nltk.download(['vader_lexicon'])
from nltk.sentiment import SentimentIntensityAnalyzer

analyzer = SentimentIntensityAnalyzer()
OLID_train['compound'] = [analyzer.polarity_scores(x)['compound'] for x in OLID_train['text']]
OLID_train['neg'] = [analyzer.polarity_scores(x)['neg'] for x in OLID_train['text']]
OLID_train['neu'] = [analyzer.polarity_scores(x)['neu'] for x in OLID_train['text']]
OLID_train['pos'] = [analyzer.polarity_scores(x)['pos'] for x in OLID_train['text']]
OLID_train.head()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\valer\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Unnamed: 0,id,text,label,target,clean,compound,neg,neu,pos
0,86426,@USER She should ask a few native Americans wh...,OFF,1,user ask native american take,0.0,0.0,1.0,0.0
1,90194,@USER @USER Go home you’re drunk!!! @USER #MAG...,OFF,1,user user go home drunk user maga trump url,-0.5067,0.247,0.753,0.0
2,16820,Amazon is investigating Chinese employees who ...,NOT,0,amazon investigating chinese employee selling ...,0.34,0.0,0.88,0.12
3,62688,"@USER Someone should'veTaken"" this piece of sh...",OFF,1,user someone vetaken piece shit volcano,-0.5574,0.286,0.714,0.0
4,43605,@USER @USER Obama wanted liberals &amp; illega...,NOT,0,user user obama wanted liberal amp illegals mo...,0.0,0.0,1.0,0.0


In [8]:
!pip install alt-profanity-check
!pip install sklearn --upgrade



You should consider upgrading via the 'c:\users\valer\anaconda3\python.exe -m pip install --upgrade pip' command.




You should consider upgrading via the 'c:\users\valer\anaconda3\python.exe -m pip install --upgrade pip' command.


In [9]:
# profenity
from profanity_check import predict, predict_prob
OLID_train['prof'] = predict(OLID_train['text'])
OLID_train['prof_prob'] = predict_prob(OLID_train['text'])
OLID_train.head()

https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations


Unnamed: 0,id,text,label,target,clean,compound,neg,neu,pos,prof,prof_prob
0,86426,@USER She should ask a few native Americans wh...,OFF,1,user ask native american take,0.0,0.0,1.0,0.0,0,0.044216
1,90194,@USER @USER Go home you’re drunk!!! @USER #MAG...,OFF,1,user user go home drunk user maga trump url,-0.5067,0.247,0.753,0.0,0,0.062316
2,16820,Amazon is investigating Chinese employees who ...,NOT,0,amazon investigating chinese employee selling ...,0.34,0.0,0.88,0.12,0,0.098105
3,62688,"@USER Someone should'veTaken"" this piece of sh...",OFF,1,user someone vetaken piece shit volcano,-0.5574,0.286,0.714,0.0,1,0.982132
4,43605,@USER @USER Obama wanted liberals &amp; illega...,NOT,0,user user obama wanted liberal amp illegals mo...,0.0,0.0,1.0,0.0,0,0.140452


In [10]:
from sklearn.svm import LinearSVC # Linear gave better results than regular SVC (with rbf kernel)

clf = LinearSVC(random_state=42, class_weight='balanced', max_iter=1500)

X = OLID_train.drop(columns=['id', 'label', 'target'])
y = OLID_train.target

# Split data into training, dev and test sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.10, stratify=y, random_state=42)
# print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

scaler = StandardScaler()

tfidf_vect = TfidfVectorizer(ngram_range=(1, 2), stop_words='english')
#tfidf_char = TfidfVectorizer(ngram_range=(2, 2), analyzer='char', stop_words='english')
#count_vect = CountVectorizer(ngram_range=(1, 3), stop_words='english')

# We distribute the different columns to different preprocessing steps
preprocessor = make_column_transformer(
     (scaler, ['compound', 'neg', 'neu', 'pos']), # we scale numeric features
     (scaler, ['prof', 'prof_prob']),
     (tfidf_vect, 'clean'),
#     (tfidf_char, 'clean'),
    remainder = 'drop')

svm_pipe = make_pipeline(preprocessor, clf)

svm_pipe.fit(X, y)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('standardscaler-1',
                                                  StandardScaler(),
                                                  ['compound', 'neg', 'neu',
                                                   'pos']),
                                                 ('standardscaler-2',
                                                  StandardScaler(),
                                                  ['prof', 'prof_prob']),
                                                 ('tfidfvectorizer',
                                                  TfidfVectorizer(ngram_range=(1,
                                                                               2),
                                                                  stop_words='english'),
                                                  'clean')])),
                ('linearsvc',
                 LinearSVC(class_weight='balanced', max_ite

In [55]:
data_path = 'test_TEXTGAIN.csv'
test_IN = pd.read_csv(data_path, encoding='utf-8')
test_IN.head()

Unnamed: 0,id,tweet
0,0,"a few American personality camels, well adjust..."
1,1,@juniormufasa_ how come lukaku flop @ man uu ...
2,2,LUKAKU IS SUCH A NIGGER I HATE THIS APE
3,3,I beg you lot keep that porch monkey coon Shau...
4,4,TRY NOT TO GET HIGH AS THE MONKEY !!! ❌\r\n\r\...


In [56]:
test_IN.rename({'tweet': 'text'}, axis=1, inplace=True) # rename columns if necessary
# test_IN.drop(['subtask_b', 'subtask_c'], axis=1, inplace=True) # drop irrelevant columns
test_IN.head()

Unnamed: 0,id,text
0,0,"a few American personality camels, well adjust..."
1,1,@juniormufasa_ how come lukaku flop @ man uu ...
2,2,LUKAKU IS SUCH A NIGGER I HATE THIS APE
3,3,I beg you lot keep that porch monkey coon Shau...
4,4,TRY NOT TO GET HIGH AS THE MONKEY !!! ❌\r\n\r\...


In [57]:
test_IN.shape

(1276, 2)

In [58]:
test_IN['clean'] = test_IN['text'].apply(lambda x: clean(x))
test_IN.head()

Unnamed: 0,id,text,clean
0,0,"a few American personality camels, well adjust...",american personality camel well adjusted cours...
1,1,@juniormufasa_ how come lukaku flop @ man uu ...,juniormufasa come lukaku flop man uu de nigger...
2,2,LUKAKU IS SUCH A NIGGER I HATE THIS APE,lukaku nigger hate ape
3,3,I beg you lot keep that porch monkey coon Shau...,beg lot keep porch monkey coon shaun bailey ti...
4,4,TRY NOT TO GET HIGH AS THE MONKEY !!! ❌\r\n\r\...,try get high monkey cop hoodie k negotiable co...


In [59]:
test_IN['compound'] = [analyzer.polarity_scores(x)['compound'] for x in test_IN['text']]
test_IN['neg'] = [analyzer.polarity_scores(x)['neg'] for x in test_IN['text']]
test_IN['neu'] = [analyzer.polarity_scores(x)['neu'] for x in test_IN['text']]
test_IN['pos'] = [analyzer.polarity_scores(x)['pos'] for x in test_IN['text']]
test_IN.head()

Unnamed: 0,id,text,clean,compound,neg,neu,pos
0,0,"a few American personality camels, well adjust...",american personality camel well adjusted cours...,0.5994,0.0,0.88,0.12
1,1,@juniormufasa_ how come lukaku flop @ man uu ...,juniormufasa come lukaku flop man uu de nigger...,-0.7717,0.401,0.599,0.0
2,2,LUKAKU IS SUCH A NIGGER I HATE THIS APE,lukaku nigger hate ape,-0.8402,0.615,0.385,0.0
3,3,I beg you lot keep that porch monkey coon Shau...,beg lot keep porch monkey coon shaun bailey ti...,0.3182,0.0,0.85,0.15
4,4,TRY NOT TO GET HIGH AS THE MONKEY !!! ❌\r\n\r\...,try get high monkey cop hoodie k negotiable co...,0.0,0.0,1.0,0.0


In [60]:
test_IN['prof'] = predict(test_IN['text'])
test_IN['prof_prob'] = predict_prob(test_IN['text'])
test_IN.head()

Unnamed: 0,id,text,clean,compound,neg,neu,pos,prof,prof_prob
0,0,"a few American personality camels, well adjust...",american personality camel well adjusted cours...,0.5994,0.0,0.88,0.12,0,0.13296
1,1,@juniormufasa_ how come lukaku flop @ man uu ...,juniormufasa come lukaku flop man uu de nigger...,-0.7717,0.401,0.599,0.0,1,0.978385
2,2,LUKAKU IS SUCH A NIGGER I HATE THIS APE,lukaku nigger hate ape,-0.8402,0.615,0.385,0.0,1,0.979777
3,3,I beg you lot keep that porch monkey coon Shau...,beg lot keep porch monkey coon shaun bailey ti...,0.3182,0.0,0.85,0.15,1,0.589529
4,4,TRY NOT TO GET HIGH AS THE MONKEY !!! ❌\r\n\r\...,try get high monkey cop hoodie k negotiable co...,0.0,0.0,1.0,0.0,0,0.167179


In [61]:
test_IN['label'] = svm_pipe.predict(test_IN)

In [62]:
test_IN.head()

Unnamed: 0,id,text,clean,compound,neg,neu,pos,prof,prof_prob,label
0,0,"a few American personality camels, well adjust...",american personality camel well adjusted cours...,0.5994,0.0,0.88,0.12,0,0.13296,0
1,1,@juniormufasa_ how come lukaku flop @ man uu ...,juniormufasa come lukaku flop man uu de nigger...,-0.7717,0.401,0.599,0.0,1,0.978385,1
2,2,LUKAKU IS SUCH A NIGGER I HATE THIS APE,lukaku nigger hate ape,-0.8402,0.615,0.385,0.0,1,0.979777,1
3,3,I beg you lot keep that porch monkey coon Shau...,beg lot keep porch monkey coon shaun bailey ti...,0.3182,0.0,0.85,0.15,1,0.589529,0
4,4,TRY NOT TO GET HIGH AS THE MONKEY !!! ❌\r\n\r\...,try get high monkey cop hoodie k negotiable co...,0.0,0.0,1.0,0.0,0,0.167179,0


In [63]:
prediction = test_IN[['id', 'label']]

In [64]:
prediction.head()

Unnamed: 0,id,label
0,0,0
1,1,1
2,2,1
3,3,0
4,4,0


In [65]:
prediction['label'] = prediction.label.map({0 : 'NOT', 1: 'OFF'}) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  prediction['label'] = prediction.label.map({0 : 'NOT', 1: 'OFF'})


In [66]:
prediction.head()

Unnamed: 0,id,label
0,0,NOT
1,1,OFF
2,2,OFF
3,3,NOT
4,4,NOT


In [67]:
prediction.shape

(1276, 2)

In [68]:
prediction.label.value_counts()

NOT    680
OFF    596
Name: label, dtype: int64

In [69]:
prediction.to_csv('test_TEXTGAIN_traditional_vale.csv', index=False)