### Language Detection with the training data and set intersection
#### Accuracy - 0.92

In [1]:
import numpy as np
import re
import pandas as pd
from nltk import wordpunct_tokenize
import string

In [2]:
pd.set_option('display.max_columns', None)

In [3]:
train_data = pd.read_csv("C:/path_for_train_dataset/dataset1.csv",skipinitialspace=True)
test_data = pd.read_csv("C:/path_for_test_dataset/dataset1_final_eval.csv",skipinitialspace=True)

In [4]:
train_data.head()

Unnamed: 0,id,text,language
0,0,Otra fotito porque pues #amor ?? WEBSITE2018,SPA
1,1,@USER @USER タイタニック２号、２０２２年に旧航路で出航へ #旅行 #歴史...,JAPANESE
2,2,"Que o Eterno nos livre do mal, ilumine nosso c...",PORTUGUESE
3,3,@USER @USER The Beautiful #KajalAggarwal #...,ENGLISH
4,4,"@USER @USER This is my painting ""Gazing ...",EN


In [5]:
test_data.head()

Unnamed: 0,id,text
0,0,Code promo Madeleine-Mode -CHF 50.- sur toute ...
1,1,@USER @USER Die erste und grösste #Schne...
2,2,@USER @USER Naturgärten anlegen und plan...
3,3,GRACIAS! Maravilloso proyecto #insideoutmalaga...
4,4,@USER Hoffe deine Tarnung drüben fliegt niema...


In [6]:
test_data["language"] = np.nan

In [7]:
test_data.head()

Unnamed: 0,id,text,language
0,0,Code promo Madeleine-Mode -CHF 50.- sur toute ...,
1,1,@USER @USER Die erste und grösste #Schne...,
2,2,@USER @USER Naturgärten anlegen und plan...,
3,3,GRACIAS! Maravilloso proyecto #insideoutmalaga...,
4,4,@USER Hoffe deine Tarnung drüben fliegt niema...,


In [9]:
train_data.language.unique()

array(['SPA', 'JAPANESE', 'PORTUGUESE', 'ENGLISH', 'EN', 'GER', 'ITA',
       'GERMAN', 'ENG', 'FRA', 'POR', 'ITALIAN', 'PT', 'FR', 'JA', 'JAP',
       'FRENCH', 'SPANISH', 'IT', 'ES', 'DE'], dtype=object)

In [10]:
language_data = ['ENGLISH','FRENCH','GERMAN','PORTUGESE','SPANISH','JAPANESE','ITALIAN']

#### Normalising the langauge values in the training file to single language attributing word

In [12]:
train_data.loc[train_data.language.str.startswith("EN"), "language"] = "ENGLISH"
train_data.loc[train_data.language.str.startswith("FR"), "language"] = "FRENCH"
train_data.loc[train_data.language.str.startswith("GE"), "language"] = "GERMAN"
train_data.loc[train_data.language.str.startswith("PO"), "language"] = "PORTUGESE"
train_data.loc[train_data.language.str.startswith("SP"), "language"] = "SPANISH"
train_data.loc[train_data.language.str.startswith("JA"), "language"] = "JAPANESE"
train_data.loc[train_data.language.str.startswith("IT"), "language"] = "ITALIAN"
train_data.loc[train_data.language.str.startswith("PT"), "language"] = "PORTUGESE"
train_data.loc[train_data.language.str.startswith("ES"), "language"] = "SPANISH"
train_data.loc[train_data.language.str.startswith("DE"), "language"] = "GERMAN"

In [13]:
train_data.language.unique()

array(['SPANISH', 'JAPANESE', 'PORTUGESE', 'ENGLISH', 'GERMAN', 'ITALIAN',
       'FRENCH'], dtype=object)

#### Removed the nametags, hashtags, punctuations and smileys from the twitter feed for both train and test files

In [14]:
train_data["text"] = train_data["text"].str.replace("(@[A-Za-z0-9]+)","")
train_data["text"] = train_data["text"].str.replace("(#[A-Za-z0-9]+)","")
train_data["text"] = train_data["text"].str.replace("[{}]".format(string.punctuation), '')
train_data["text"] = train_data["text"].str.strip()
train_data["text"] = train_data["text"].str.replace("[^\w\s#@/:%.,_-]", "", flags=re.UNICODE)
train_data = train_data.apply(lambda x: x.str.strip() if x.dtype == "object" else x)


test_data["text"] = test_data["text"].str.replace("(@[A-Za-z0-9]+)","")
test_data["text"] = test_data["text"].str.replace("(#[A-Za-z0-9]+)","")
test_data["text"] = test_data["text"].str.replace("[{}]".format(string.punctuation), '')
test_data["text"] = test_data["text"].str.strip()
test_data["text"] = test_data["text"].str.replace("[^\w\s#@/:%.,_-]", "", flags=re.UNICODE)
test_data = test_data.apply(lambda x: x.str.strip() if x.dtype == "object" else x)

In [15]:
train_data.head()

Unnamed: 0,id,text,language
0,0,Otra fotito porque pues WEBSITE2018,SPANISH
1,1,タイタニック２号２０２２年に旧航路で出航へ 旅行 歴史 WEBSITE2018,JAPANESE
2,2,Que o Eterno nos livre do mal ilumine nosso ca...,PORTUGESE
3,3,The Beautiful WEBSITE2018,ENGLISH
4,4,This is my painting Gazing Along The Hills You...,ENGLISH


In [16]:
test_data.head()

Unnamed: 0,id,text,language
0,0,Code promo MadeleineMode CHF 50 sur toute la b...,
1,1,Die erste und grösste der Schweiz gehört Armi...,
2,2,Naturgärten anlegen und planen Auch für das ö...,
3,3,GRACIAS Maravilloso proyecto que ha llegado a...,
4,4,Hoffe deine Tarnung drüben fliegt niemals auf,


#### Split the words in the training file of twitter feed to list of words

In [17]:
train_data["string"] = train_data["text"].str.split(" ")

#### From the training dataset, create list of words attributed to different languages to help in test data detection 

In [18]:
english = pd.Series(train_data.string.where(train_data["language"] == "ENGLISH")).dropna()
merge_english = english[:].values.tolist()
merge_english = [item for sublist in merge_english for item in sublist]

french = pd.Series(train_data.string.where(train_data["language"] == "FRENCH")).dropna()
merge_french = french[:].values.tolist()
merge_french = [item for sublist in merge_french for item in sublist]

german = pd.Series(train_data.string.where(train_data["language"] == "GERMAN")).dropna()
merge_german = german[:].values.tolist()
merge_german = [item for sublist in merge_german for item in sublist]

italian = pd.Series(train_data.string.where(train_data["language"] == "ITALIAN")).dropna()
merge_italian = italian[:].values.tolist()
merge_italian = [item for sublist in merge_italian for item in sublist]

portugese = pd.Series(train_data.string.where(train_data["language"] == "PORTUGESE")).dropna()
merge_portugese = portugese[:].values.tolist()
merge_portugese = [item for sublist in merge_portugese for item in sublist] 

spanish = pd.Series(train_data.string.where(train_data["language"] == "SPANISH")).dropna()
merge_spanish = spanish[:].values.tolist()
merge_spanish = [item for sublist in merge_spanish for item in sublist] 

japanese = pd.Series(train_data.string.where(train_data["language"] == "JAPANESE")).dropna()
merge_japanese = japanese[:].values.tolist()
merge_japanese = [item for sublist in merge_japanese for item in sublist] 

In [19]:
most_rated_language = string

#### Tokenize each twitter feed from the test data and find an intersection with the set of identified words attributed to particular language in [18] command.

#### Then find the ratio of words of each language in the twitter feed, then attribute the tweet to the language with highest ratio

In [20]:
for index, row in test_data.iterrows():
     
    f = wordpunct_tokenize(row["text"])
    words = (word.lower() for word in f)
    language_ratio = {}

    language = {"ENGLISH" : merge_english, "FRENCH" : merge_french, "GERMAN" : merge_german, "ITALIAN" : merge_italian, "PORTUGESE" : merge_portugese, "SPANISH" : merge_spanish, "JAPANESE" : merge_japanese}

    stopwordSet_english = set(merge_english)
    stopwordSet_french = set(merge_french)
    stopwordSet_german = set(merge_german)
    stopwordSet_italian = set(merge_italian)
    stopwordSet_portugese = set(merge_portugese)
    stopwordSet_spanish = set(merge_spanish)
    stopwordSet_japanese = set(merge_japanese)

    word_set = set(words)

    common_elements_english = word_set.intersection(stopwordSet_english)
    common_elements_french = word_set.intersection(stopwordSet_french)
    common_elements_german = word_set.intersection(stopwordSet_german)
    common_elements_italian = word_set.intersection(stopwordSet_italian)
    common_elements_portugese = word_set.intersection(stopwordSet_portugese)
    common_elements_spanish = word_set.intersection(stopwordSet_spanish)
    common_elements_japanese = word_set.intersection(stopwordSet_japanese)

    language_ratio["ENGLISH"] = len(common_elements_english)
    language_ratio["FRENCH"] = len(common_elements_french)
    language_ratio["GERMAN"] = len(common_elements_german)
    language_ratio["ITALIAN"] = len(common_elements_italian)
    language_ratio["PORTUGESE"] = len(common_elements_portugese)
    language_ratio["SPANISH"] = len(common_elements_spanish)
    language_ratio["JAPANESE"] = len(common_elements_japanese)

    most_rated_language = str( max(language_ratio, key = language_ratio.get))
    #print(most_rated_language)
    test_data.language.iloc[index] = most_rated_language

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


#### Test Data with identified language with the help of training data

In [21]:
test_data.head()

Unnamed: 0,id,text,language
0,0,Code promo MadeleineMode CHF 50 sur toute la b...,FRENCH
1,1,Die erste und grösste der Schweiz gehört Armi...,GERMAN
2,2,Naturgärten anlegen und planen Auch für das ö...,GERMAN
3,3,GRACIAS Maravilloso proyecto que ha llegado a...,SPANISH
4,4,Hoffe deine Tarnung drüben fliegt niemals auf,GERMAN


In [22]:
result = test_data[["id","language"]]
result["label"] = np.nan

In [23]:
result.head()

Unnamed: 0,id,language,label
0,0,FRENCH,
1,1,GERMAN,
2,2,GERMAN,
3,3,SPANISH,
4,4,GERMAN,


In [24]:
result.loc[result.language.isin(["ENGLISH"]), "label"] = 1
result.loc[result.language.isin(["FRENCH"]), "label"] = 2
result.loc[result.language.isin(["GERMAN"]), "label"] = 3
result.loc[result.language.isin(["ITALIAN"]), "label"] = 4
result.loc[result.language.isin(["PORTUGESE"]), "label"] = 5
result.loc[result.language.isin(["SPANISH"]), "label"] = 6
result.loc[result.language.isin(["JAPANESE"]), "label"] = 7

del result["language"]
result.head()

Unnamed: 0,id,label
0,0,2.0
1,1,3.0
2,2,3.0
3,3,6.0
4,4,3.0


Required format of test data

In [25]:
result.label = result.label.astype(int)
result.head()

Unnamed: 0,id,label
0,0,2
1,1,3
2,2,3
3,3,6
4,4,3


In [None]:
result.to_csv("C:/Users/tripa/OneDrive/Desktop/AIB Datathon/langauge_final.csv", index = False)