# TURKISH TEXT CLASSIFICATION

### Kaggle Requirements

In [None]:
# Install the Kaggle command-line client quietly (-q).
!pip install -q kaggle
# Colab helper used to upload files from the local machine into the runtime.
from google.colab import files
# Opens the upload dialog; a later cell copies `kaggle.json` from the working
# directory, so presumably that is the file to upload here - confirm.
files.upload()

In [None]:
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!kaggle competitions download -c dogus-teknoloji-n11
!unzip train_n11.csv.zip -d sample_data
!unzip test_n11.csv.zip -d sample_data

### Importing fastText

In [None]:
!git clone https://github.com/facebookresearch/fastText.git
!cd fastText
!pip install fastText

import fasttext

In [None]:
# Download the gzip-compressed cc.tr.300 word-vector file and decompress it in
# place, leaving `cc.tr.300.vec` in the working directory.
# NOTE(review): no later cell visible here references `cc.tr.300.vec` - confirm
# whether it was meant to be passed to the training call.
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.tr.300.vec.gz
!gzip -d cc.tr.300.vec.gz

### Libraries

In [None]:
import re
import gc
import nltk
import string
from bs4 import BeautifulSoup


import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score

### Dataset Preprocessing

In [None]:
# The train and test files are pipe-delimited; the sample submission is read
# with pandas' default delimiter.
df_train = pd.read_csv("sample_data/train_n11.csv", delimiter='|')
df_test = pd.read_csv("sample_data/test_n11.csv", delimiter='|')
df_sub = pd.read_csv("sample_submission_n11.csv")

In [None]:
# List the distinct category ids that occur in the training data.
print(df_train["CATEGORY_ID"].unique())

[1000365 1001377 1001527 1000420 1000958 1000271 1003197 1000037 1001693
 1000264 1001676 1000476 1000038 1000402 1000426 1001660 1000830 1000180
 1000262 1000361 1000352 1000191 1000258 1000173 1000024 1000479 1001679
 1000393 1185232 1000436 1000834 1001580 1000186 1182207 1001522 1001524
 1002510 1001410 1000950 1002599 1000185 1002547 1000208 1000956 1000561
 1000184 1000354 1000967 1000454 1000833 1000542 1137102 1001479 1000953
 1002512 1003389 1000259 1001384 1000835 1002509 1000353 1002479 1001429
 1187203 1000197 1000457 1106103 1153150 1001426 1185202 1000263 1000324
 1000446 1003382 1000340 1000363 1000474 1000543 1001412 1000957 1106104
 1001653 1001669 1000347 1182215 1000490 1001395 1001576 1001399 1000205
 1000368 1000380 1000395 1140100 1000190 1000350 1000538 1001421 1000202
 1001662 1000394 1002603 1000400 1000013 1001452 1001506 1000383 1106101
 1000200 1174203 1137103 1002600 1000286 1000422 1002589 1000440 1000007
 1001573 1001441 1001428 1002478 1024100 1000396 11

In [None]:
# Map every CATEGORY_ID to a dense integer code (0, 1, 2, ... in order of first
# appearance) and keep it in a `labels` column.
df_train['labels'] = df_train['CATEGORY_ID'].factorize()[0]
df_train.head()

Unnamed: 0,ID,TITLE,DESCRIPTION,CATEGORY_ID,labels
0,46715,Sonia SN-X60 Kulak Üstü Oyuncu Kulaklık,<style>\n .uni-content {\n font-fami...,1000365,0
1,77151,ABC Power Sıvı Bulaşık Deterjanı 1370 G,<style>\n .uni-content {\n font-fami...,1001377,1
2,91464,Einhell TE-CD 18/2 Li 18V 2.5 Ah Li-ion Akülü ...,<style>\n .uni-content {\n font-fami...,1001527,2
3,53057,Arzum AR4095 Olimpia Smart Cyclone 899 W Filtr...,"<meta name=""viewport"" content=""width=device-wi...",1000420,3
4,73951,Pedigree Biftekli ve Kümes Hayvanlı Yetişkin K...,<style>\n .uni-content {\n font-fami...,1000958,4


In [None]:
# TO LOWERCASE
# Python's default lowercasing turns the Turkish dotted capital "İ" (U+0130) into
# "i" followed by a combining dot (U+0307), which leaves a stray mark inside tokens.
# Map it to a plain "i" first, then lowercase as before.
# NOTE(review): "I" is still lowercased to "i" (not the Turkish dotless "ı"), as in
# the original - confirm whether that is wanted for these titles.
df_train['TITLE'] = df_train['TITLE'].str.replace('İ', 'i', regex=False).str.lower()
df_test['TITLE'] = df_test['TITLE'].str.replace('İ', 'i', regex=False).str.lower()


# REMOVING PUNCTUATIONS
english_punctuations = string.punctuation
punctuations_list = english_punctuations
def cleaning_punctuations(text):
    """Return ``text`` with every character listed in ``punctuations_list`` deleted."""
    # Mapping a code point to None tells str.translate to drop that character.
    deletion_table = {ord(symbol): None for symbol in punctuations_list}
    return text.translate(deletion_table)


# Strip punctuation from every title in both frames.
df_train['TITLE'] = df_train['TITLE'].apply(cleaning_punctuations)
df_test['TITLE'] = df_test['TITLE'].apply(cleaning_punctuations)



# CLEANING NUMERIC DATA
def cleaning_numbers(data):
    """Return ``data`` with every run of the digits 0-9 removed."""
    digit_runs = re.compile('[0-9]+')
    return digit_runs.sub('', data)

# Remove digit runs from every title in both frames.
df_train['TITLE'] = df_train['TITLE'].apply(cleaning_numbers)
df_test['TITLE'] = df_test['TITLE'].apply(cleaning_numbers)

In [None]:
# Hold out 20% of the labelled rows for evaluation; the fixed seed keeps the split
# reproducible. Note that `test` here is this held-out part of df_train, not df_test.
train, test = train_test_split(
    df_train,
    test_size=0.2,
    random_state=42,
)

## FASTTEXT

In [None]:
# Renumber both splits from 0 in place, discarding the shuffled original index.
for split_frame in (train, test):
    split_frame.reset_index(drop=True, inplace=True)

In [None]:
# Build one fastText line per row: "__label__<CATEGORY_ID> <TITLE>".
# Whole-column string concatenation replaces the per-row chained assignment
# (`frame.label_format[i] = ...`), which is slow, starts from an integer column, and
# does not write back into the frame when pandas Copy-on-Write is enabled.
train["label_format"] = (
    "__label__" + train["CATEGORY_ID"].astype(str) + " " + train["TITLE"].astype(str)
)

test["label_format"] = (
    "__label__" + test["CATEGORY_ID"].astype(str) + " " + test["TITLE"].astype(str)
)

In [None]:
# Write one "__label__<id> <title>" line per row as plain UTF-8 text.
# Plain writes are used instead of DataFrame.to_csv so that a line containing a
# comma or a double quote is never wrapped in CSV quotes, which would put a quote
# character in front of the "__label__" prefix.
for split_frame, out_path in ((train, 'fasttext_train.txt'), (test, 'fasttext_test.txt')):
    with open(out_path, 'w', encoding='utf-8') as out_file:
        out_file.writelines(f"{line}\n" for line in split_frame["label_format"])

### Training & Testing the model

In [None]:
# Train a supervised fastText classifier on the exported training file:
# 100 epochs, learning rate 0.05, 300-dimensional vectors.
# NOTE(review): the notebook downloads `cc.tr.300.vec` earlier, but this call does
# not pass any pretrained vectors - confirm whether that is intentional.
model = fasttext.train_supervised(
    'fasttext_train.txt',
    epoch=100,
    lr=0.05,
    label_prefix='__label__',
    dim=300,
)

In [None]:
# Evaluate on the training file itself and on the held-out file.
result = model.test('fasttext_train.txt')
validation = model.test('fasttext_test.txt')


# DISPLAY ACCURACY OF TRAINED MODEL
# Element 1 of each result is what gets reported (precision at 1 according to the
# fastText Python documentation - confirm against the installed version).
text_line = "".join(
    ["accuracy:", str(result[1]), ",validation:", str(validation[1]), '\n']
)
print(text_line)

accuracy:0.9957829116244958,validation:0.98197152111471



### Submission

In [None]:
# Run the classifier on every cleaned test title, one title per call.
predictions = [model.predict(title) for title in df_test['TITLE']]

# Keep only the first element of each prediction (the predicted label part).
column1 = [prediction[0] for prediction in predictions]

In [None]:
# Reduce each predicted label to its digits by deleting every non-digit character
# from its string form, leaving the numeric category id as text.
column2 = [re.sub("[^0-9]", "", str(label_entry)) for label_entry in column1]

In [None]:
# Store the predicted category ids on the test frame as integers.
df_test['label1'] = pd.Series(column2, index=df_test.index).astype(int)

# Replace the placeholder `Expected` column of the sample submission with the
# predictions (the assignment aligns on the row index of both frames).
df_sub.drop(columns='Expected', inplace=True)
df_sub['Expected'] = df_test['label1']

In [None]:
# Save the submission as UTF-8 CSV without the row index.
df_sub.to_csv(
    'Submissionft1.csv',
    index=False,
    encoding='utf-8',
)